From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f0c4bec6565b..e7af4834ffea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,6 +53,8 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_SLAB_RECLAIMABLE, + MEMCG_SLAB_UNRECLAIMABLE, MEMCG_NR_STAT, }; @@ -883,6 +885,20 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } + +/** + * memcg_kmem_update_page_stat - update kmem page state statistics + * @page: the page + * @idx: page state item to account + * @val: number of pages (positive or negative) + */ +static inline void memcg_kmem_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) +{ + if (memcg_kmem_enabled() && page->mem_cgroup) + this_cpu_add(page->mem_cgroup->stat->count[idx], val); +} + #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -928,6 +944,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } + +static inline void memcg_kmem_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3-70-g09d2 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 4 ++++ include/linux/memcontrol.h | 3 ++- kernel/fork.c | 10 +++++++++- mm/memcontrol.c | 2 ++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e4e0c1d78cee..e2f4e7948a66 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel_stack + + Amount of memory allocated to kernel stacks. + slab Amount of memory used for storing in-kernel data diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e7af4834ffea..d6300313b298 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,9 +52,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, + MEMCG_SOCK, MEMCG_NR_STAT, }; diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..accb7221d547 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf464fd..4b7dda7c2e74 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.2.3-70-g09d2 From 832fc1de01aea28255cb11d270679b7f1273f0d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:41 -0700 Subject: /proc/kpageflags: return KPF_BUDDY for "tail" buddy pages Currently /proc/kpageflags returns nothing for "tail" buddy pages, which is inconvenient when grasping how free pages are distributed. This patch sets KPF_BUDDY for such pages. With this patch: $ grep MemFree /proc/meminfo ; tools/vm/page-types -b buddy MemFree: 3134992 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000400 779272 3044 __________B_______________________________ buddy 0x0000000000000c00 4385 17 __________BM______________________________ buddy,mmap total 783657 3061 783657 pages is 3134628 kB (roughly consistent with the global counter,) so it's OK. [akpm@linux-foundation.org: update comment, per Naoya] Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 6 ++++-- include/linux/page-flags.h | 2 ++ mm/internal.h | 3 --- mm/page_alloc.c | 2 -- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/proc/page.c b/fs/proc/page.c index b2855eea5405..0be626d85331 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) * pseudo flags for the well known (anonymous) memory mapped pages * * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapcount() is not enough. + * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapcount(page)) + if (!PageSlab(page) && page_mapped(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; @@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page) */ if (PageBuddy(page)) u |= 1 << KPF_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + u |= 1 << KPF_BUDDY; if (PageBalloon(page)) u |= 1 << KPF_BALLOON; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 19724e6ebd26..597695523679 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -593,6 +593,8 @@ static inline void __ClearPageBuddy(struct page *page) atomic_set(&page->_mapcount, -1); } +extern bool is_free_buddy_page(struct page *page); + #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) static inline int PageBalloon(struct page *page) diff --git a/mm/internal.h b/mm/internal.h index ad9400d759c8..b95952c2faec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d14b6f..c7332a4bc8db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7152,7 +7152,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7170,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif -- cgit v1.2.3-70-g09d2 From 698b1b30642f1ff0ea10ef1de9745ab633031377 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:08 -0700 Subject: mm, compaction: introduce kcompactd Memory compaction can be currently performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attemps - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overal memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes. - (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and ompact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Future possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Paul Gortmaker Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 16 +++ include/linux/mmzone.h | 6 ++ include/linux/vm_event_item.h | 1 + include/trace/events/compaction.h | 55 ++++++++++ mm/compaction.c | 222 ++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 9 +- mm/page_alloc.c | 3 + mm/vmstat.c | 1 + 8 files changed, 311 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4cd4ddf64cc7..d7c8de583a23 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -52,6 +52,10 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); extern bool compaction_restarting(struct zone *zone, int order); +extern int kcompactd_run(int nid); +extern void kcompactd_stop(int nid); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); + #else static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, @@ -84,6 +88,18 @@ static inline bool compaction_deferred(struct zone *zone, int order) return true; } +static inline int kcompactd_run(int nid) +{ + return 0; +} +static inline void kcompactd_stop(int nid) +{ +} + +static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ +} + #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6de02ac378a0..bdd9a270a813 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -668,6 +668,12 @@ typedef struct pglist_data { mem_hotplug_begin/end() */ int kswapd_max_order; enum zone_type classzone_idx; +#ifdef CONFIG_COMPACTION + int kcompactd_max_order; + enum zone_type kcompactd_classzone_idx; + wait_queue_head_t kcompactd_wait; + struct task_struct *kcompactd; +#endif #ifdef CONFIG_NUMA_BALANCING /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 67c1dbd19c6d..58ecc056ee45 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -53,6 +53,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, + KCOMPACTD_WAKE, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 111e5666e5eb..e215bf68f521 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -350,6 +350,61 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, ); #endif +TRACE_EVENT(mm_compaction_kcompactd_sleep, + + TP_PROTO(int nid), + + TP_ARGS(nid), + + TP_STRUCT__entry( + __field(int, nid) + ), + + TP_fast_assign( + __entry->nid = nid; + ), + + TP_printk("nid=%d", __entry->nid) +); + +DECLARE_EVENT_CLASS(kcompactd_wake_template, + + TP_PROTO(int nid, int order, enum zone_type classzone_idx), + + TP_ARGS(nid, order, classzone_idx), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(enum zone_type, classzone_idx) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->classzone_idx = classzone_idx; + ), + + TP_printk("nid=%d order=%d classzone_idx=%-8s", + __entry->nid, + __entry->order, + __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) +); + +DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, + + TP_PROTO(int nid, int order, enum zone_type classzone_idx), + + TP_ARGS(nid, order, classzone_idx) +); + +DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, + + TP_PROTO(int nid, int order, enum zone_type classzone_idx), + + TP_ARGS(nid, order, classzone_idx) +); + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index 93f71d968098..5b2bfbaa821a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 2007-2010 Mel Gorman */ +#include #include #include #include @@ -17,6 +18,8 @@ #include #include #include +#include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24ea06393816..d9bcb26fc4df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1105,8 +1106,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) + if (onlined_pages) { kswapd_run(zone_to_nid(zone)); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1880,8 +1883,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1fc19ebb8a2..25a75da53c27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5405,6 +5405,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 69ce64f7b8d7..f80066248c94 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -826,6 +826,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3-70-g09d2 From ee91ef6173e81819f5ff610c2485802081635657 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:18:21 -0700 Subject: bufferhead: force inlining of buffer head flag operations With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, set_buffer_foo(), clear_buffer_foo() and similar functions get deinlined about 60 times. Examples of disassembly: (14 copies, 43 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 80 0f 20 lock orb $0x20,(%rdi) 5d pop %rbp c3 retq (3 copies, 34 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 c1 e8 05 shr $0x5,%rax 83 e0 01 and $0x1,%eax 5d pop %rbp c3 retq (5 copies, 13 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 80 0f 40 lock orb $0x40,(%rdi) 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. This decreases vmlinux by about 3 kbytes. text data bss dec hex filename 88200439 19905208 36421632 144527279 89d4faf vmlinux2 88197239 19905240 36421632 144524111 89d434f vmlinux Signed-off-by: Denys Vlasenko Cc: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 89d9aa9e79bf..c67f052cc5e5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -82,15 +82,15 @@ struct buffer_head { * and buffer_foo() functions. */ #define BUFFER_FNS(bit, name) \ -static inline void set_buffer_##name(struct buffer_head *bh) \ +static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ set_bit(BH_##bit, &(bh)->b_state); \ } \ -static inline void clear_buffer_##name(struct buffer_head *bh) \ +static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ clear_bit(BH_##bit, &(bh)->b_state); \ } \ -static inline int buffer_##name(const struct buffer_head *bh) \ +static __always_inline int buffer_##name(const struct buffer_head *bh) \ { \ return test_bit(BH_##bit, &(bh)->b_state); \ } @@ -99,11 +99,11 @@ static inline int buffer_##name(const struct buffer_head *bh) \ * test_set_buffer_foo() and test_clear_buffer_foo() */ #define TAS_BUFFER_FNS(bit, name) \ -static inline int test_set_buffer_##name(struct buffer_head *bh) \ +static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \ { \ return test_and_set_bit(BH_##bit, &(bh)->b_state); \ } \ -static inline int test_clear_buffer_##name(struct buffer_head *bh) \ +static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \ { \ return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ } \ -- cgit v1.2.3-70-g09d2 From 4b0f326163f09e6d67f09dd5e1a70093cee710fc Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:18:24 -0700 Subject: include/linux/page-flags.h: force inlining of selected page flag modifications Sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, the following functions get deinlined many times. Examples of disassembly: (43 copies, 141 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 80 0f 08 lock orb $0x8,(%rdi) 5d pop %rbp c3 retq (10 copies, 134 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 c1 e8 0b shr $0xb,%rax 83 e0 01 and $0x1,%eax 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. Code size decrease after the patch is ~7k: text data bss dec hex filename 92125002 20826048 36417536 149368586 8e72f0a vmlinux 92118087 20826112 36417536 149361735 8e71447 vmlinux7_pageops_after Signed-off-by: Denys Vlasenko Cc: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 597695523679..f4ed4f1b0c77 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -144,12 +144,12 @@ static inline struct page *compound_head(struct page *page) return page; } -static inline int PageTail(struct page *page) +static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; } -static inline int PageCompound(struct page *page) +static __always_inline int PageCompound(struct page *page) { return test_bit(PG_head, &page->flags) || PageTail(page); } @@ -184,31 +184,31 @@ static inline int PageCompound(struct page *page) * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ -static inline int Page##uname(struct page *page) \ +static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ -static inline void SetPage##uname(struct page *page) \ +static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ -static inline void ClearPage##uname(struct page *page) \ +static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ -static inline void __SetPage##uname(struct page *page) \ +static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ -static inline void __ClearPage##uname(struct page *page) \ +static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ -static inline int TestSetPage##uname(struct page *page) \ +static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ -static inline int TestClearPage##uname(struct page *page) \ +static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ @@ -371,7 +371,7 @@ PAGEFLAG(Idle, idle, PF_ANY) #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) -static inline int PageAnon(struct page *page) +static __always_inline int PageAnon(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; @@ -384,7 +384,7 @@ static inline int PageAnon(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static inline int PageKsm(struct page *page) +static __always_inline int PageKsm(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == @@ -415,14 +415,14 @@ static inline int PageUptodate(struct page *page) return ret; } -static inline void __SetPageUptodate(struct page *page) +static __always_inline void __SetPageUptodate(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); __set_bit(PG_uptodate, &page->flags); } -static inline void SetPageUptodate(struct page *page) +static __always_inline void SetPageUptodate(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); /* @@ -456,12 +456,12 @@ static inline void set_page_writeback_keepwrite(struct page *page) __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) -static inline void set_compound_head(struct page *page, struct page *head) +static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } -static inline void clear_compound_head(struct page *page) +static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } -- cgit v1.2.3-70-g09d2 From b6ecd2dea4435a771a99c497a6ac5df6d3618c5a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:33 -0700 Subject: mm: memcontrol: zap memcg_kmem_online helper As kmem accounting is now either enabled for all cgroups or disabled system-wide, there's no point in having memcg_kmem_online() helper - instead one can use memcg_kmem_enabled() and mem_cgroup_online(), as shrink_slab() now does. There are only two places left where this helper is used - __memcg_kmem_charge() and memcg_create_kmem_cache(). The former can only be called if memcg_kmem_enabled() returned true. Since the cgroup it operates on is online, mem_cgroup_is_root() check will be enough. memcg_create_kmem_cache() can't use mem_cgroup_online() helper instead of memcg_kmem_online(), because it relies on the fact that in memcg_offline_kmem() memcg->kmem_state is changed before memcg_deactivate_kmem_caches() is called, but there we can just open-code the check. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ---------- mm/memcontrol.c | 2 +- mm/slab_common.c | 2 +- 3 files changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d6300313b298..bc8e4e22f58f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -795,11 +795,6 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_online(struct mem_cgroup *memcg) -{ - return memcg->kmem_state == KMEM_ONLINE; -} - /* * In general, we'll do everything in our power to not incur in any overhead * for non-memcg users for the kmem functions. Not even a function call, if we @@ -909,11 +904,6 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline bool memcg_kmem_online(struct mem_cgroup *memcg) -{ - return false; -} - static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 28d1b1e9d4fb..341bf86d26c2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2346,7 +2346,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - if (memcg_kmem_online(memcg)) + if (!mem_cgroup_is_root(memcg)) ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; diff --git a/mm/slab_common.c b/mm/slab_common.c index 6afb2263a5c5..8addc3c4df37 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_online(memcg)) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); -- cgit v1.2.3-70-g09d2 From 0a6b76dd23fa08c5fd7b68acdb55018a37afd4aa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:42 -0700 Subject: mm: workingset: make shadow node shrinker memcg aware Workingset code was recently made memcg aware, but shadow node shrinker is still global. As a result, one small cgroup can consume all memory available for shadow nodes, possibly hurting other cgroups by reclaiming their shadow nodes, even though reclaim distances stored in its shadow nodes have no effect. To avoid this, we need to make shadow node shrinker memcg aware. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ mm/memcontrol.c | 5 ++--- mm/workingset.c | 10 +++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bc8e4e22f58f..1191d79aa495 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,6 +403,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages); +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask); + static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { @@ -661,6 +664,13 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, { } +static inline unsigned long +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + return 0; +} + static inline void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 341bf86d26c2..ae8b81c55685 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -638,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, - unsigned int lru_mask) +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) { unsigned long nr = 0; int zid; diff --git a/mm/workingset.c b/mm/workingset.c index 68e8cd94ebe4..8a75f8d2916a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,8 +349,12 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + - node_page_state(sc->nid, NR_INACTIVE_FILE); + if (memcg_kmem_enabled()) + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + else + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); /* * Active cache pages are limited to 50% of memory, and shadow @@ -460,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; /* -- cgit v1.2.3-70-g09d2 From f9719a03de51e13526d614e79d002f838770b2d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:45 -0700 Subject: thp, vmstats: count deferred split events Count how many times we put a THP in split queue. Currently, it happens on partial unmap of a THP. Rapidly growing value can indicate that an application behaves unfriendly wrt THP: often fault in huge page and then unmap part of it. This leads to unnecessary memory fragmentation and the application may require tuning. The event also can help with debugging kernel [mis-]behaviour. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 5 +++++ include/linux/vm_event_item.h | 1 + mm/huge_memory.c | 1 + mm/vmstat.c | 1 + 4 files changed, 8 insertions(+) (limited to 'include') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 21cf34f3ddb2..0dc8632aa01e 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -229,6 +229,11 @@ thp_split_page is incremented every time a huge page is split into base thp_split_page_failed is is incremented if kernel fails to split huge page. This can happen if the page was pinned by somebody. +thp_deferred_split_page is incremented when a huge page is put onto split + queue. This happens when a huge page is partially unmapped and + splitting it would free up some memory. Pages on split queue are + going to be split under memory pressure. + thp_split_pmd is incremented every time a PMD split into table of PTEs. This can happen, for instance, when application calls mprotect() or munmap() on part of huge page. It doesn't split huge page, only diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 58ecc056ee45..ec084321fe09 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -72,6 +72,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_COLLAPSE_ALLOC_FAILED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, + THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ea21e203a70..1dddfb21fc22 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3455,6 +3455,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; } diff --git a/mm/vmstat.c b/mm/vmstat.c index f80066248c94..5e4300482897 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -848,6 +848,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc_failed", "thp_split_page", "thp_split_page_failed", + "thp_deferred_split_page", "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", -- cgit v1.2.3-70-g09d2 From ea606cf5d8df370e7932460dfd960b21f20e7c6d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Mar 2016 14:18:48 -0700 Subject: mm: move max_map_count bits into mm.h max_map_count sysctl unrelated to scheduler. Move its bits from include/linux/sched/sysctl.h to include/linux/mm.h. Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/sched/sysctl.h | 21 --------------------- mm/mmap.c | 1 - mm/mremap.c | 1 - mm/nommu.c | 1 - 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index dbf1eddab964..6922adf41938 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -82,6 +82,27 @@ extern int mmap_rnd_compat_bits __read_mostly; #define mm_forbids_zeropage(X) (0) #endif +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + * + * ELF extended numbering allows more than 65535 sections, so 16-bit bound is + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + +extern int sysctl_max_map_count; + extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4f080ab4f2cd..22db1e63707e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -14,27 +14,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, enum { sysctl_hung_task_timeout_secs = 0 }; #endif -/* - * Default maximum number of active map areas, this limits the number of vmas - * per mm struct. Users can overwrite this number by sysctl but there is a - * problem. - * - * When a program's coredump is generated as ELF format, a section is created - * per a vma. In ELF, the number of sections is represented in unsigned short. - * This means the number of sections should be smaller than 65535 at coredump. - * Because the kernel adds some informative sections to a image of program at - * generating coredump, we need some margin. The number of extra sections is - * 1-3 now and depends on arch. We use "5" as safe margin, here. - * - * ELF extended numbering allows more than 65535 sections, so 16-bit bound is - * not a hard limit any more. Although some userspace tools can be surprised by - * that. - */ -#define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - -extern int sysctl_max_map_count; - extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; diff --git a/mm/mmap.c b/mm/mmap.c index 90e3b869a8b9..676f422f2e2c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index 8eeba02fc991..e30c8a6489a6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1d6c9..9bdf8b119078 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From bcf6691797f425b301f629bb783b7ff2d0bcfa5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:53 -0700 Subject: mm, tracing: refresh __def_vmaflag_names Get list of VMA flags up-to-date and sort it to match VM_* definition order. [vbabka@suse.cz: add a note above vmaflag definitions to update the names when changing] Signed-off-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/trace/events/mmflags.h | 23 ++++++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6922adf41938..b3202612bb95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -143,6 +143,7 @@ extern unsigned int kobjsize(const void *objp); /* * vm_flags in vm_area_struct, see mm_types.h. + * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index a849185c82f0..43cedbf0c759 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -111,15 +111,21 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) ) : "none" #if defined(CONFIG_X86) -#define __VM_ARCH_SPECIFIC {VM_PAT, "pat" } +#define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" } #elif defined(CONFIG_PPC) -#define __VM_ARCH_SPECIFIC {VM_SAO, "sao" } +#define __VM_ARCH_SPECIFIC_1 {VM_SAO, "sao" } #elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) -#define __VM_ARCH_SPECIFIC {VM_GROWSUP, "growsup" } +#define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP, "growsup" } #elif !defined(CONFIG_MMU) -#define __VM_ARCH_SPECIFIC {VM_MAPPED_COPY,"mappedcopy" } +#define __VM_ARCH_SPECIFIC_1 {VM_MAPPED_COPY,"mappedcopy" } #else -#define __VM_ARCH_SPECIFIC {VM_ARCH_1, "arch_1" } +#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } +#endif + +#if defined(CONFIG_X86) +#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" } +#else +#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" } #endif #ifdef CONFIG_MEM_SOFT_DIRTY @@ -138,19 +144,22 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) {VM_MAYEXEC, "mayexec" }, \ {VM_MAYSHARE, "mayshare" }, \ {VM_GROWSDOWN, "growsdown" }, \ + {VM_UFFD_MISSING, "uffd_missing" }, \ {VM_PFNMAP, "pfnmap" }, \ {VM_DENYWRITE, "denywrite" }, \ - {VM_LOCKONFAULT, "lockonfault" }, \ + {VM_UFFD_WP, "uffd_wp" }, \ {VM_LOCKED, "locked" }, \ {VM_IO, "io" }, \ {VM_SEQ_READ, "seqread" }, \ {VM_RAND_READ, "randread" }, \ {VM_DONTCOPY, "dontcopy" }, \ {VM_DONTEXPAND, "dontexpand" }, \ + {VM_LOCKONFAULT, "lockonfault" }, \ {VM_ACCOUNT, "account" }, \ {VM_NORESERVE, "noreserve" }, \ {VM_HUGETLB, "hugetlb" }, \ - __VM_ARCH_SPECIFIC , \ + __VM_ARCH_SPECIFIC_1 , \ + __VM_ARCH_SPECIFIC_2 , \ {VM_DONTDUMP, "dontdump" }, \ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ -- cgit v1.2.3-70-g09d2 From 458aa76d132dc1c3c60be0f0db99bcc0ce1767fc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 17 Mar 2016 14:18:56 -0700 Subject: mm/thp/migration: switch from flush_tlb_range to flush_pmd_tlb_range We remove one instace of flush_tlb_range here. That was added by commit f714f4f20e59 ("mm: numa: call MMU notifiers on THP migration"). But the pmdp_huge_clear_flush_notify should have done the require flush for us. Hence remove the extra flush. Signed-off-by: Aneesh Kumar K.V Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 17 +++++++++++++++++ mm/migrate.c | 8 +++++--- mm/pgtable-generic.c | 14 -------------- 3 files changed, 22 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index c370b261c720..9401f4819891 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -783,6 +783,23 @@ static inline int pmd_clear_huge(pmd_t *pmd) } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * ARCHes with special requirements for evicting THP backing TLB entries can + * implement this. Otherwise also, it can help optimize normal TLB flush in + * THP regime. stock flush_tlb_range() typically has optimization to nuke the + * entire TLB TLB if flush span is greater than a threshold, which will + * likely be true for a single huge page. Thus a single thp flush will + * invalidate the entire TLB which is not desitable. + * e.g. see arch/arc: flush_pmd_tlb_range + */ +#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#else +#define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() +#endif +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/mm/migrate.c b/mm/migrate.c index 568284ec75d4..fdaf0818fb30 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ if (mm_tlb_flush_pending(mm)) flush_tlb_range(vma, mmun_start, mmun_end); @@ -1829,12 +1832,11 @@ fail_putback: page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); - flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); if (page_count(page) != 2) { set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_tlb_range(vma, mmun_start, mmun_end); + flush_pmd_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page, true); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 06a005b979a7..71c5f9109f2a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE - -/* - * ARCHes with special requirements for evicting THP backing TLB entries can - * implement this. Otherwise also, it can help optimize normal TLB flush in - * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB if flush span is greater than a threshold, which will - * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desirable. - * e.g. see arch/arc: flush_pmd_tlb_range - */ -#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, -- cgit v1.2.3-70-g09d2 From d02bd27bd33dd7e8d22594cd568b81be0cb584cd Mon Sep 17 00:00:00 2001 From: Igor Redko Date: Thu, 17 Mar 2016 14:19:05 -0700 Subject: mm/page_alloc.c: calculate 'available' memory in a separate function Add a new field, VIRTIO_BALLOON_S_AVAIL, to virtio_balloon memory statistics protocol, corresponding to 'Available' in /proc/meminfo. It indicates to the hypervisor how big the balloon can be inflated without pushing the guest system to swap. This metric would be very useful in VM orchestration software to improve memory management of different VMs under overcommit. This patch (of 2): Factor out calculation of the available memory counter into a separate exportable function, in order to be able to use it in other parts of the kernel. In particular, it appears a relevant metric to report to the hypervisor via virtio-balloon statistics interface (in a followup patch). Signed-off-by: Igor Redko Signed-off-by: Denis V. Lunev Reviewed-by: Roman Kagan Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 31 +------------------------------ include/linux/mm.h | 1 + mm/page_alloc.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index df4661abadc4..83720460c5bc 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long committed; long cached; long available; - unsigned long pagecache; - unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; - struct zone *zone; int lru; /* @@ -51,33 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); - for_each_zone(zone) - wmark_low += zone->watermark[WMARK_LOW]; - - /* - * Estimate the amount of memory available for userspace allocations, - * without causing swapping. - */ - available = i.freeram - totalreserve_pages; - - /* - * Not all the page cache can be freed, otherwise the system will - * start swapping. Assume at least half of the page cache, or the - * low watermark worth of cache, needs to stay. - */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; - pagecache -= min(pagecache / 2, wmark_low); - available += pagecache; - - /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. - */ - available += global_page_state(NR_SLAB_RECLAIMABLE) - - min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); - - if (available < 0) - available = 0; + available = si_mem_available(); /* * Tagged format, for easy grepping and expansion. diff --git a/include/linux/mm.h b/include/linux/mm.h index b3202612bb95..db9df3f78de1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1875,6 +1875,7 @@ extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(unsigned int flags); +extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25a75da53c27..941b802e11ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3713,6 +3713,49 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + */ + available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; -- cgit v1.2.3-70-g09d2 From 5057dcd0f1aaad57e07e728ba20a99e205c6b9de Mon Sep 17 00:00:00 2001 From: Igor Redko Date: Thu, 17 Mar 2016 14:19:08 -0700 Subject: virtio_balloon: export 'available' memory to balloon statistics Add a new field, VIRTIO_BALLOON_S_AVAIL, to virtio_balloon memory statistics protocol, corresponding to 'Available' in /proc/meminfo. It indicates to the hypervisor how big the balloon can be inflated without pushing the guest system to swap. Signed-off-by: Igor Redko Signed-off-by: Denis V. Lunev Reviewed-by: Roman Kagan Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_balloon.c | 6 ++++++ include/uapi/linux/virtio_balloon.h | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 0c3691f46575..f2b77dea8d3c 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units. So each page is pointed to by @@ -229,10 +230,13 @@ static void update_balloon_stats(struct virtio_balloon *vb) unsigned long events[NR_VM_EVENT_ITEMS]; struct sysinfo i; int idx = 0; + long available; all_vm_events(events); si_meminfo(&i); + available = si_mem_available(); + update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, @@ -243,6 +247,8 @@ static void update_balloon_stats(struct virtio_balloon *vb) pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, pages_to_bytes(i.totalram)); + update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL, + pages_to_bytes(available)); } /* diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index d7f1cbc3766c..343d7ddefe04 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -51,7 +51,8 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MINFLT 3 /* Number of minor faults */ #define VIRTIO_BALLOON_S_MEMFREE 4 /* Total amount of free memory */ #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ -#define VIRTIO_BALLOON_S_NR 6 +#define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */ +#define VIRTIO_BALLOON_S_NR 7 /* * Memory statistics structure. -- cgit v1.2.3-70-g09d2 From 3ed3a4f0ddffece942bb2661924d87be4ce63cb7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:19:11 -0700 Subject: mm: cleanup *pte_alloc* interfaces There are few things about *pte_alloc*() helpers worth cleaning up: - 'vma' argument is unused, let's drop it; - most __pte_alloc() callers do speculative check for pmd_none(), before taking ptl: let's introduce pte_alloc() macro which does the check. The only direct user of __pte_alloc left is userfaultfd, which has different expectation about atomicity wrt pmd. - pte_alloc_map() and pte_alloc_map_lock() are redefined using pte_alloc(). [sudeep.holla@arm.com: fix build for arm64 hugetlbpage] [sfr@canb.auug.org.au: fix arch/arm/mm/mmu.c some more] Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Sudeep Holla Acked-by: Kirill A. Shutemov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmu.c | 6 +++--- arch/arm/mm/pgd.c | 2 +- arch/arm64/mm/hugetlbpage.c | 2 +- arch/ia64/mm/hugetlbpage.c | 2 +- arch/metag/mm/hugetlbpage.c | 2 +- arch/parisc/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/tile/mm/hugetlbpage.c | 2 +- arch/um/kernel/skas/mmu.c | 2 +- arch/unicore32/mm/pgd.c | 2 +- arch/x86/kernel/tboot.c | 2 +- include/linux/mm.h | 17 ++++++++--------- mm/memory.c | 8 +++----- mm/mremap.c | 3 +-- mm/userfaultfd.c | 3 +-- 16 files changed, 27 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 434d76f0b363..88fbe0d23ca6 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -732,7 +732,7 @@ static void *__init late_alloc(unsigned long sz) return ptr; } -static pte_t * __init pte_alloc(pmd_t *pmd, unsigned long addr, +static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot, void *(*alloc)(unsigned long sz)) { @@ -747,7 +747,7 @@ static pte_t * __init pte_alloc(pmd_t *pmd, unsigned long addr, static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot) { - return pte_alloc(pmd, addr, prot, early_alloc); + return arm_pte_alloc(pmd, addr, prot, early_alloc); } static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, @@ -756,7 +756,7 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, void *(*alloc)(unsigned long sz), bool ng) { - pte_t *pte = pte_alloc(pmd, addr, type->prot_l1, alloc); + pte_t *pte = arm_pte_alloc(pmd, addr, type->prot_l1, alloc); do { set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), ng ? PTE_EXT_NG : 0); diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index e683db1b90a3..b8d477321730 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -80,7 +80,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); + new_pte = pte_alloc_map(mm, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index da30529bb1f6..589fd28e1fb5 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -124,7 +124,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, * will be no pte_unmap() to correspond with this * pte_alloc_map(). */ - pte = pte_alloc_map(mm, NULL, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); } else if (sz == PMD_SIZE) { if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && pud_none(*pud)) diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index f50d4b3f501a..85de86d36fdf 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, NULL, pmd, taddr); + pte = pte_alloc_map(mm, pmd, taddr); } return pte; } diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 53f0f6c47027..b38700ae4e84 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -67,7 +67,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pgd = pgd_offset(mm, addr); pud = pud_offset(pgd, addr); pmd = pmd_offset(pud, addr); - pte = pte_alloc_map(mm, NULL, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); pgd->pgd &= ~_PAGE_SZ_MASK; pgd->pgd |= _PAGE_SZHUGE; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index 54ba39262b82..5d6eea925cf4 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -63,7 +63,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, NULL, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); } return pte; } diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 6385f60209b6..cc948db74878 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, NULL, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); } } diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 131eaf4ad7f5..4977800e9770 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -146,7 +146,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, NULL, pmd, addr); + pte = pte_alloc_map(mm, pmd, addr); } return pte; } diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index c034dc3fe2d4..e212c64682c5 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -77,7 +77,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, else { if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE]) panic("Unexpected page size %#lx\n", sz); - return pte_alloc_map(mm, NULL, pmd, addr); + return pte_alloc_map(mm, pmd, addr); } } #else diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 9591a66aa5c5..3943e9d7d13d 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, NULL, pmd, proc); + pte = pte_alloc_map(mm, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index 2ade20d8eab3..c572a28c76c9 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -54,7 +54,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); + new_pte = pte_alloc_map(mm, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 91a4496db434..e72a07f20b05 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/include/linux/mm.h b/include/linux/mm.h index db9df3f78de1..75d1907b9009 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1545,8 +1545,7 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1672,15 +1671,15 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, vma, pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ - pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc(mm, pmd, address) \ + (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address)) + +#define pte_alloc_map(mm, pmd, address) \ + (pte_alloc(mm, pmd, address) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ - pmd, address))? \ - NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + (pte_alloc(mm, pmd, address) ? \ + NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ diff --git a/mm/memory.c b/mm/memory.c index 0e247642ed5b..1974fc02c4d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -562,8 +562,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); @@ -3419,12 +3418,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Use __pte_alloc instead of pte_alloc_map, because we can't + * Use pte_alloc() instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pte_alloc(mm, pmd, address))) return VM_FAULT_OOM; /* * If a huge pmd materialized under us just retry later. Use diff --git a/mm/mremap.c b/mm/mremap.c index e30c8a6489a6..3fa0a467df66 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -213,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, continue; VM_BUG_ON(pmd_trans_huge(*old_pmd)); } - if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, - new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 806b0c758c5b..9f3a0290b273 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -230,8 +230,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, - dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } -- cgit v1.2.3-70-g09d2 From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula in default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 18 ++++++++++++++++++ include/linux/mm.h | 1 + include/linux/mmzone.h | 2 ++ kernel/sysctl.c | 10 ++++++++++ mm/page_alloc.c | 29 +++++++++++++++++++++++++++-- 5 files changed, 58 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 89a887c76629..cb0368459da3 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable directory and inode objects. With vfs_cache_pressure=1000, it will look for ten times more freeable objects than there are. +============================================================= + +watermark_scale_factor: + +This factor controls the aggressiveness of kswapd. It defines the +amount of memory left in a node/system before kswapd is woken up and +how much memory needs to be free before kswapd goes back to sleep. + +The unit is in fractions of 10,000. The default value of 10 means the +distances between watermarks are 0.1% of the available memory in the +node/system. The maximum value is 1000, or 10% of memory. + +A high rate of threads entering direct reclaim (allocstall) or kswapd +going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate +that the number of free pages kswapd maintains for latency reasons is +too small for the allocation bursts occurring in the system. This knob +can then be used to tune kswapd aggressiveness accordingly. + ============================================================== zone_reclaim_mode: diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d1907b9009..f7fd64227d3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1889,6 +1889,7 @@ extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; +extern int watermark_scale_factor; /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bdd9a270a813..c60df9257cc7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f5102fabef7f..725587f10667 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1403,6 +1404,15 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941b802e11ec..d156310aedeb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -6347,8 +6348,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. + */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6489,6 +6499,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.3-70-g09d2 From b14a1ef58e8acb7fa64c1c8f745b21fd8a577aba Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Thu, 17 Mar 2016 14:19:17 -0700 Subject: mm: remove unnecessary description about a non-exist gfp flag Since __GFP_NOACCOUNT was removed by commit 20b5c3039863 ("Revert 'gfp: add __GFP_NOACCOUNT'"), its description is not necessary. Signed-off-by: Satoru Takeuchi Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bb16dfeb917e..c083d0820a87 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -105,8 +105,6 @@ struct vm_area_struct; * * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the __GFP_MEMALLOC flag if both are set. - * - * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement. */ #define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) -- cgit v1.2.3-70-g09d2 From 444eb2a449ef36fe115431ed7b71467c4563c7f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 17 Mar 2016 14:19:23 -0700 Subject: mm: thp: set THP defrag by default to madvise and add a stall-free defrag option THP defrag is enabled by default to direct reclaim/compact but not wake kswapd in the event of a THP allocation failure. The problem is that THP allocation requests potentially enter reclaim/compaction. This potentially incurs a severe stall that is not guaranteed to be offset by reduced TLB misses. While there has been considerable effort to reduce the impact of reclaim/compaction, it is still a high cost and workloads that should fit in memory fail to do so. Specifically, a simple anon/file streaming workload will enter direct reclaim on NUMA at least even though the working set size is 80% of RAM. It's been years and it's time to throw in the towel. First, this patch defines THP defrag as follows; madvise: A failed allocation will direct reclaim/compact if the application requests it never: Neither reclaim/compact nor wake kswapd defer: A failed allocation will wake kswapd/kcompactd always: A failed allocation will direct reclaim/compact (historical behaviour) khugepaged defrag will enter direct/reclaim but not wake kswapd. Next it sets the default defrag option to be "madvise" to only enter direct reclaim/compaction for applications that specifically requested it. Lastly, it removes a check from the page allocator slowpath that is related to __GFP_THISNODE to allow "defer" to work. The callers that really cares are slub/slab and they are updated accordingly. The slab one may be surprising because it also corrects a comment as kswapd was never woken up by that path. This means that a THP fault will no longer stall for most applications by default and the ideal for most users that get THP if they are immediately available. There are still options for users that prefer a stall at startup of a new application by either restoring historical behaviour with "always" or pick a half-way point with "defer" where kswapd does some of the work in the background and wakes kcompactd if necessary. THP defrag for khugepaged remains enabled and will enter direct/reclaim but no wakeup kswapd or kcompactd. After this patch a THP allocation failure will quickly fallback and rely on khugepaged to recover the situation at some time in the future. In some cases, this will reduce THP usage but the benefit of THP is hard to measure and not a universal win where as a stall to reclaim/compaction is definitely measurable and can be painful. The first test for this is using "usemem" to read a large file and write a large anonymous mapping (to avoid the zero page) multiple times. The total size of the mappings is 80% of RAM and the benchmark simply measures how long it takes to complete. It uses multiple threads to see if that is a factor. On UMA, the performance is almost identical so is not reported but on NUMA, we see this usemem 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean System-1 102.86 ( 0.00%) 46.81 ( 54.50%) Amean System-4 37.85 ( 0.00%) 34.02 ( 10.12%) Amean System-7 48.12 ( 0.00%) 46.89 ( 2.56%) Amean System-12 51.98 ( 0.00%) 56.96 ( -9.57%) Amean System-21 80.16 ( 0.00%) 79.05 ( 1.39%) Amean System-30 110.71 ( 0.00%) 107.17 ( 3.20%) Amean System-48 127.98 ( 0.00%) 124.83 ( 2.46%) Amean Elapsd-1 185.84 ( 0.00%) 105.51 ( 43.23%) Amean Elapsd-4 26.19 ( 0.00%) 25.58 ( 2.33%) Amean Elapsd-7 21.65 ( 0.00%) 21.62 ( 0.16%) Amean Elapsd-12 18.58 ( 0.00%) 17.94 ( 3.43%) Amean Elapsd-21 17.53 ( 0.00%) 16.60 ( 5.33%) Amean Elapsd-30 17.45 ( 0.00%) 17.13 ( 1.84%) Amean Elapsd-48 15.40 ( 0.00%) 15.27 ( 0.82%) For a single thread, the benchmark completes 43.23% faster with this patch applied with smaller benefits as the thread increases. Similar, notice the large reduction in most cases in system CPU usage. The overall CPU time is 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 User 10357.65 10438.33 System 3988.88 3543.94 Elapsed 2203.01 1634.41 Which is substantial. Now, the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 128458477 278352931 Major Faults 2174976 225 Swap Ins 16904701 0 Swap Outs 17359627 0 Allocation stalls 43611 0 DMA allocs 0 0 DMA32 allocs 19832646 19448017 Normal allocs 614488453 580941839 Movable allocs 0 0 Direct pages scanned 24163800 0 Kswapd pages scanned 0 0 Kswapd pages reclaimed 0 0 Direct pages reclaimed 20691346 0 Compaction stalls 42263 0 Compaction success 938 0 Compaction failures 41325 0 This patch eliminates almost all swapping and direct reclaim activity. There is still overhead but it's from NUMA balancing which does not identify that it's pointless trying to do anything with this workload. I also tried the thpscale benchmark which forces a corner case where compaction can be used heavily and measures the latency of whether base or huge pages were used thpscale Fault Latencies 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean fault-base-1 5288.84 ( 0.00%) 2817.12 ( 46.73%) Amean fault-base-3 6365.53 ( 0.00%) 3499.11 ( 45.03%) Amean fault-base-5 6526.19 ( 0.00%) 4363.06 ( 33.15%) Amean fault-base-7 7142.25 ( 0.00%) 4858.08 ( 31.98%) Amean fault-base-12 13827.64 ( 0.00%) 10292.11 ( 25.57%) Amean fault-base-18 18235.07 ( 0.00%) 13788.84 ( 24.38%) Amean fault-base-24 21597.80 ( 0.00%) 24388.03 (-12.92%) Amean fault-base-30 26754.15 ( 0.00%) 19700.55 ( 26.36%) Amean fault-base-32 26784.94 ( 0.00%) 19513.57 ( 27.15%) Amean fault-huge-1 4223.96 ( 0.00%) 2178.57 ( 48.42%) Amean fault-huge-3 2194.77 ( 0.00%) 2149.74 ( 2.05%) Amean fault-huge-5 2569.60 ( 0.00%) 2346.95 ( 8.66%) Amean fault-huge-7 3612.69 ( 0.00%) 2997.70 ( 17.02%) Amean fault-huge-12 3301.75 ( 0.00%) 6727.02 (-103.74%) Amean fault-huge-18 6696.47 ( 0.00%) 6685.72 ( 0.16%) Amean fault-huge-24 8000.72 ( 0.00%) 9311.43 (-16.38%) Amean fault-huge-30 13305.55 ( 0.00%) 9750.45 ( 26.72%) Amean fault-huge-32 9981.71 ( 0.00%) 10316.06 ( -3.35%) The average time to fault pages is substantially reduced in the majority of caseds but with the obvious caveat that fewer THPs are actually used in this adverse workload 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Percentage huge-1 0.71 ( 0.00%) 14.04 (1865.22%) Percentage huge-3 10.77 ( 0.00%) 33.05 (206.85%) Percentage huge-5 60.39 ( 0.00%) 38.51 (-36.23%) Percentage huge-7 45.97 ( 0.00%) 34.57 (-24.79%) Percentage huge-12 68.12 ( 0.00%) 40.07 (-41.17%) Percentage huge-18 64.93 ( 0.00%) 47.82 (-26.35%) Percentage huge-24 62.69 ( 0.00%) 44.23 (-29.44%) Percentage huge-30 43.49 ( 0.00%) 55.38 ( 27.34%) Percentage huge-32 50.72 ( 0.00%) 51.90 ( 2.35%) 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 37429143 47564000 Major Faults 1916 1558 Swap Ins 1466 1079 Swap Outs 2936863 149626 Allocation stalls 62510 3 DMA allocs 0 0 DMA32 allocs 6566458 6401314 Normal allocs 216361697 216538171 Movable allocs 0 0 Direct pages scanned 25977580 17998 Kswapd pages scanned 0 3638931 Kswapd pages reclaimed 0 207236 Direct pages reclaimed 8833714 88 Compaction stalls 103349 5 Compaction success 270 4 Compaction failures 103079 1 Note again that while this does swap as it's an aggressive workload, the direct relcim activity and allocation stalls is substantially reduced. There is some kswapd activity but ftrace showed that the kswapd activity was due to normal wakeups from 4K pages being allocated. Compaction-related stalls and activity are almost eliminated. I also tried the stutter benchmark. For this, I do not have figures for NUMA but it's something that does impact UMA so I'll report what is available stutter 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Min mmap 7.3571 ( 0.00%) 7.3438 ( 0.18%) 1st-qrtle mmap 7.5278 ( 0.00%) 17.9200 (-138.05%) 2nd-qrtle mmap 7.6818 ( 0.00%) 21.6055 (-181.25%) 3rd-qrtle mmap 11.0889 ( 0.00%) 21.8881 (-97.39%) Max-90% mmap 27.8978 ( 0.00%) 22.1632 ( 20.56%) Max-93% mmap 28.3202 ( 0.00%) 22.3044 ( 21.24%) Max-95% mmap 28.5600 ( 0.00%) 22.4580 ( 21.37%) Max-99% mmap 29.6032 ( 0.00%) 25.5216 ( 13.79%) Max mmap 4109.7289 ( 0.00%) 4813.9832 (-17.14%) Mean mmap 12.4474 ( 0.00%) 19.3027 (-55.07%) This benchmark is trying to fault an anonymous mapping while there is a heavy IO load -- a scenario that desktop users used to complain about frequently. This shows a mix because the ideal case of mapping with THP is not hit as often. However, note that 99% of the mappings complete 13.79% faster. The CPU usage here is particularly interesting 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 User 67.50 0.99 System 1327.88 91.30 Elapsed 2079.00 2128.98 And once again we look at the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 335241922 1314582827 Major Faults 715 819 Swap Ins 0 0 Swap Outs 0 0 Allocation stalls 532723 0 DMA allocs 0 0 DMA32 allocs 1822364341 1177950222 Normal allocs 1815640808 1517844854 Movable allocs 0 0 Direct pages scanned 21892772 0 Kswapd pages scanned 20015890 41879484 Kswapd pages reclaimed 19961986 41822072 Direct pages reclaimed 21892741 0 Compaction stalls 1065755 0 Compaction success 514 0 Compaction failures 1065241 0 Allocation stalls and all direct reclaim activity is eliminated as well as compaction-related stalls. THP gives impressive gains in some cases but only if they are quickly available. We're not going to reach the point where they are completely free so lets take the costs out of the fast paths finally and defer the cost to kswapd, kcompactd and khugepaged where it belongs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 17 +++++++ include/linux/gfp.h | 2 +- include/linux/huge_mm.h | 9 +--- mm/huge_memory.c | 101 +++++++++++++++++++++++++++-------------- mm/page_alloc.c | 8 ---- mm/slab.c | 8 ++-- mm/slub.c | 2 +- 7 files changed, 91 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 0dc8632aa01e..d9cb65cf5cfd 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -113,9 +113,26 @@ guaranteed, but it may be more likely in case the allocation is for a MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo defer >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag +"always" means that an application requesting THP will stall on allocation +failure and directly reclaim pages and compact memory in an effort to +allocate a THP immediately. This may be desirable for virtual machines +that benefit heavily from THP use and are willing to delay the VM start +to utilise them. + +"defer" means that an application will wake kswapd in the background +to reclaim pages and wake kcompact to compact memory so that THP is +available in the near future. It's the responsibility of khugepaged +to then install the THP pages later. + +"madvise" will enter direct reclaim like "always" but only for regions +that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. + +"never" should be self-explanatory. + By default kernel tries to use huge zero page on read page fault. It's possible to disable huge zero page by writing 0 or enable it back by writing 1: diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c083d0820a87..11d56c6e7ef2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -257,7 +257,7 @@ struct vm_area_struct; #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ - ~__GFP_KSWAPD_RECLAIM) + ~__GFP_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 459fd25b378e..a4cecb4801ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -41,7 +41,8 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, @@ -71,12 +72,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); ((__vma)->vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ !is_vma_temporary_stack(__vma)) -#define transparent_hugepage_defrag(__vma) \ - ((transparent_hugepage_flags & \ - (1<vm_flags & VM_HUGEPAGE)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. */ @@ -919,7 +950,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1279,7 +1310,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -2249,11 +2280,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2296,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2335,8 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d156310aedeb..096a00d98a45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3119,14 +3119,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); diff --git a/mm/slab.c b/mm/slab.c index 56dd0df2a8ce..e1f6c27c3db5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif diff --git a/mm/slub.c b/mm/slub.c index 2f2f04d39104..64ed5f3a3046 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1426,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { -- cgit v1.2.3-70-g09d2 From fe896d1878949ea92ba547587bc3075cc688fb8f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:26 -0700 Subject: mm: introduce page reference manipulation functions The success of CMA allocation largely depends on the success of migration and key factor of it is page reference count. Until now, page reference is manipulated by direct calling atomic functions so we cannot follow up who and where manipulate it. Then, it is hard to find actual reason of CMA allocation failure. CMA allocation should be guaranteed to succeed so finding offending place is really important. In this patch, call sites where page reference is manipulated are converted to introduced wrapper function. This is preparation step to add tracepoint to each page reference manipulation function. With this facility, we can easily find reason of CMA allocation failure. There is no functional change in this patch. In addition, this patch also converts reference read sites. It will help a second step that renames page._count to something else and prevents later attempt to direct access to it (Suggested by Andrew). Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 2 +- arch/powerpc/mm/mmu_context_hash64.c | 3 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/platforms/512x/mpc512x_shared.c | 2 +- arch/x86/mm/gup.c | 2 +- drivers/block/aoe/aoecmd.c | 4 +- drivers/net/ethernet/freescale/gianfar.c | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 9 ++- drivers/net/ethernet/sun/niu.c | 2 +- fs/nilfs2/page.c | 2 +- include/linux/mm.h | 25 ++----- include/linux/page_ref.h | 85 +++++++++++++++++++++++ include/linux/pagemap.h | 19 +---- mm/debug.c | 2 +- mm/huge_memory.c | 6 +- mm/internal.h | 7 +- mm/memory_hotplug.c | 4 +- mm/migrate.c | 10 +-- mm/page_alloc.c | 12 ++-- mm/vmscan.c | 6 +- net/core/sock.c | 2 +- 25 files changed, 134 insertions(+), 82 deletions(-) create mode 100644 include/linux/page_ref.h (limited to 'include') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 1afd87c999b0..6cdffc76735c 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -64,7 +64,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) { VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); - atomic_add(nr, &page->_count); + page_ref_add(page, nr); SetPageReferenced(page); } diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index 4e4efbc2658e..9ca6fe16cb29 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -118,8 +118,7 @@ static void destroy_pagetable_page(struct mm_struct *mm) /* drop all the pending references */ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count); - if (!count) { + if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { pgtable_page_dtor(page); free_hot_cold_page(page, 0); } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index cdf2123d46db..d9cc66cbdbb7 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -403,7 +403,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) * count. */ if (likely(!mm->context.pte_frag)) { - atomic_set(&page->_count, PTE_FRAG_NR); + set_page_count(page, PTE_FRAG_NR); mm->context.pte_frag = ret + PTE_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 711f3d352af7..452da2391153 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -188,7 +188,7 @@ static struct fsl_diu_shared_fb __attribute__ ((__aligned__(8))) diu_shared_fb; static inline void mpc512x_free_bootmem(struct page *page) { BUG_ON(PageTail(page)); - BUG_ON(atomic_read(&page->_count) > 1); + BUG_ON(page_ref_count(page) > 1); free_reserved_page(page); } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d8a798d8bf50..f8d0b5e8bdfd 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -131,7 +131,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) { VM_BUG_ON_PAGE(page != compound_head(page), page); VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_add(nr, &page->_count); + page_ref_add(page, nr); SetPageReferenced(page); } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d048d2009e89..437b3a822f44 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -875,7 +875,7 @@ bio_pageinc(struct bio *bio) * compound pages is no longer allowed by the kernel. */ page = compound_head(bv.bv_page); - atomic_inc(&page->_count); + page_ref_inc(page); } } @@ -888,7 +888,7 @@ bio_pagedec(struct bio *bio) bio_for_each_segment(bv, bio, iter) { page = compound_head(bv.bv_page); - atomic_dec(&page->_count); + page_ref_dec(page); } } diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index b9ecf197ad11..f21b2c479780 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2944,7 +2944,7 @@ static bool gfar_add_rx_frag(struct gfar_rx_buff *rxb, u32 lstatus, /* change offset to the other half */ rxb->page_offset ^= GFAR_RXB_TRUESIZE; - atomic_inc(&page->_count); + page_ref_inc(page); return true; } diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index b243c3cbe68f..b4547ebed774 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -243,7 +243,7 @@ static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer, /* Even if we own the page, we are not allowed to use atomic_set() * This would break get_page_unless_zero() users. */ - atomic_inc(&page->_count); + page_ref_inc(page); return true; } diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 31e5f3942839..5b4ad1ad4d5f 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6630,7 +6630,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, /* Even if we own the page, we are not allowed to use atomic_set() * This would break get_page_unless_zero() users. */ - atomic_inc(&page->_count); + page_ref_inc(page); return true; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c4003a88bbf6..e6035ff6b861 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1942,7 +1942,7 @@ static bool ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, /* Even if we own the page, we are not allowed to use atomic_set() * This would break get_page_unless_zero() users. */ - atomic_inc(&page->_count); + page_ref_inc(page); return true; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 3558f019b631..0ea14c0a2e74 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -837,7 +837,7 @@ add_tail_frag: /* Even if we own the page, we are not allowed to use atomic_set() * This would break get_page_unless_zero() users. */ - atomic_inc(&page->_count); + page_ref_inc(page); return true; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 41440b2b20a3..86bcfe510e4e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -82,8 +82,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv, /* Not doing get_page() for each frag is a big win * on asymetric workloads. Note we can not use atomic_set(). */ - atomic_add(page_alloc->page_size / frag_info->frag_stride - 1, - &page->_count); + page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1); return 0; } @@ -127,7 +126,7 @@ out: dma_unmap_page(priv->ddev, page_alloc[i].dma, page_alloc[i].page_size, PCI_DMA_FROMDEVICE); page = page_alloc[i].page; - atomic_set(&page->_count, 1); + set_page_count(page, 1); put_page(page); } } @@ -165,7 +164,7 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv, en_dbg(DRV, priv, " frag %d allocator: - size:%d frags:%d\n", i, ring->page_alloc[i].page_size, - atomic_read(&ring->page_alloc[i].page->_count)); + page_ref_count(ring->page_alloc[i].page)); } return 0; @@ -177,7 +176,7 @@ out: dma_unmap_page(priv->ddev, page_alloc->dma, page_alloc->page_size, PCI_DMA_FROMDEVICE); page = page_alloc->page; - atomic_set(&page->_count, 1); + set_page_count(page, 1); put_page(page); page_alloc->page = NULL; } diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index ab6051a43134..9cc45649f477 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -3341,7 +3341,7 @@ static int niu_rbr_add_page(struct niu *np, struct rx_ring_info *rp, niu_hash_page(rp, page, addr); if (rp->rbr_blocks_per_page > 1) - atomic_add(rp->rbr_blocks_per_page - 1, &page->_count); + page_ref_add(page, rp->rbr_blocks_per_page - 1); for (i = 0; i < rp->rbr_blocks_per_page; i++) { __le32 *rbr = &rp->rbr[start_index + i]; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 45d650addd56..c20df77eff99 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page) printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", - page, atomic_read(&page->_count), + page, page_ref_count(page), (unsigned long long)page->index, page->flags, m, ino); if (page_has_buffers(page)) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f7fd64227d3a..997fc2e5d9d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -22,6 +22,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -386,8 +387,8 @@ static inline int pmd_devmap(pmd_t pmd) */ static inline int put_page_testzero(struct page *page) { - VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page); - return atomic_dec_and_test(&page->_count); + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + return page_ref_dec_and_test(page); } /* @@ -398,7 +399,7 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - return atomic_inc_not_zero(&page->_count); + return page_ref_add_unless(page, 1, 0); } extern int page_is_ram(unsigned long pfn); @@ -486,11 +487,6 @@ static inline int total_mapcount(struct page *page) } #endif -static inline int page_count(struct page *page) -{ - return atomic_read(&compound_head(page)->_count); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); @@ -498,15 +494,6 @@ static inline struct page *virt_to_head_page(const void *x) return compound_head(page); } -/* - * Setup the page count before being freed into the page allocator for - * the first time (boot or memory hotplug) - */ -static inline void init_page_count(struct page *page) -{ - atomic_set(&page->_count, 1); -} - void __put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -716,8 +703,8 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); + VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + page_ref_inc(page); if (unlikely(is_zone_device_page(page))) get_zone_device_page(page); diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h new file mode 100644 index 000000000000..30f5817f6b8e --- /dev/null +++ b/include/linux/page_ref.h @@ -0,0 +1,85 @@ +#ifndef _LINUX_PAGE_REF_H +#define _LINUX_PAGE_REF_H + +#include +#include +#include + +static inline int page_ref_count(struct page *page) +{ + return atomic_read(&page->_count); +} + +static inline int page_count(struct page *page) +{ + return atomic_read(&compound_head(page)->_count); +} + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + set_page_count(page, 1); +} + +static inline void page_ref_add(struct page *page, int nr) +{ + atomic_add(nr, &page->_count); +} + +static inline void page_ref_sub(struct page *page, int nr) +{ + atomic_sub(nr, &page->_count); +} + +static inline void page_ref_inc(struct page *page) +{ + atomic_inc(&page->_count); +} + +static inline void page_ref_dec(struct page *page) +{ + atomic_dec(&page->_count); +} + +static inline int page_ref_sub_and_test(struct page *page, int nr) +{ + return atomic_sub_and_test(nr, &page->_count); +} + +static inline int page_ref_dec_and_test(struct page *page) +{ + return atomic_dec_and_test(&page->_count); +} + +static inline int page_ref_dec_return(struct page *page) +{ + return atomic_dec_return(&page->_count); +} + +static inline int page_ref_add_unless(struct page *page, int nr, int u) +{ + return atomic_add_unless(&page->_count, nr, u); +} + +static inline int page_ref_freeze(struct page *page, int count) +{ + return likely(atomic_cmpxchg(&page->_count, count, 0) == count); +} + +static inline void page_ref_unfreeze(struct page *page, int count) +{ + VM_BUG_ON_PAGE(page_count(page) != 0, page); + VM_BUG_ON(count == 0); + + atomic_set(&page->_count, count); +} + +#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 183b15ea052b..1ebd65c91422 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -165,7 +165,7 @@ static inline int page_cache_get_speculative(struct page *page) * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_inc(&page->_count); + page_ref_inc(page); #else if (unlikely(!get_page_unless_zero(page))) { @@ -194,10 +194,10 @@ static inline int page_cache_add_speculative(struct page *page, int count) VM_BUG_ON(!in_atomic()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_add(count, &page->_count); + page_ref_add(page, count); #else - if (unlikely(!atomic_add_unless(&page->_count, count, 0))) + if (unlikely(!page_ref_add_unless(page, count, 0))) return 0; #endif VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); @@ -205,19 +205,6 @@ static inline int page_cache_add_speculative(struct page *page, int count) return 1; } -static inline int page_freeze_refs(struct page *page, int count) -{ - return likely(atomic_cmpxchg(&page->_count, count, 0) == count); -} - -static inline void page_unfreeze_refs(struct page *page, int count) -{ - VM_BUG_ON_PAGE(page_count(page) != 0, page); - VM_BUG_ON(count == 0); - - atomic_set(&page->_count, count); -} - #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else diff --git a/mm/debug.c b/mm/debug.c index df7247b0b532..8865bfb41b0b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, atomic_read(&page->_count), page_mapcount(page), + page, page_ref_count(page), page_mapcount(page), page->mapping, page->index); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e08b1659ff19..bb944c771c82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2888,7 +2888,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); - atomic_add(HPAGE_PMD_NR - 1, &page->_count); + page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); @@ -3257,7 +3257,7 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But @@ -3270,7 +3270,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc(). */ - atomic_inc(&page_tail->_count); + page_ref_inc(page_tail); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & diff --git a/mm/internal.h b/mm/internal.h index 4042a8a05672..57d7b0e839f0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,11 +38,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline void set_page_count(struct page *page, int v) -{ - atomic_set(&page->_count, v); -} - extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); @@ -64,7 +59,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(atomic_read(&page->_count), page); + VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c832ef3565cc..e62aa078f5c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -167,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page, page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); - atomic_inc(&page->_count); + page_ref_inc(page); } void put_page_bootmem(struct page *page) @@ -178,7 +178,7 @@ void put_page_bootmem(struct page *page) BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - if (atomic_dec_return(&page->_count) == 1) { + if (page_ref_dec_return(page) == 1) { ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/migrate.c b/mm/migrate.c index fdaf0818fb30..577c94b8e959 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -349,7 +349,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -363,7 +363,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { - page_unfreeze_refs(page, expected_count); + page_ref_unfreeze(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -397,7 +397,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. */ - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock(&mapping->tree_lock); /* Leave irq disabled to prevent preemption while updating stats */ @@ -451,7 +451,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -463,7 +463,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 096a00d98a45..30134a8f7cc8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -766,7 +766,7 @@ static inline int free_pages_check(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; @@ -1462,7 +1462,7 @@ static inline int check_new_page(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; @@ -3475,7 +3475,7 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(size - 1, &page->_count); + page_ref_add(page, size - 1); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); @@ -3487,7 +3487,7 @@ refill: if (unlikely(offset < 0)) { page = virt_to_page(nc->va); - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -3495,7 +3495,7 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); + set_page_count(page, size); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = size; @@ -6852,7 +6852,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * This check already skips compound tails of THP * because their page->_count is zero at all time. */ - if (!atomic_read(&page->_count)) { + if (!page_ref_count(page)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index b41b82d4bab1..b934223eaa45 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -638,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (!page_freeze_refs(page, 2)) + if (!page_ref_freeze(page, 2)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_unfreeze_refs(page, 2); + page_ref_unfreeze(page, 2); goto cannot_free; } @@ -704,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) * drops the pagecache ref for us without requiring another * atomic operation. */ - page_unfreeze_refs(page, 1); + page_ref_unfreeze(page, 1); return 1; } return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 6c1c8bc93412..67e7efe12ff7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1903,7 +1903,7 @@ EXPORT_SYMBOL(sock_cmsg_send); bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { - if (atomic_read(&pfrag->page->_count) == 1) { + if (page_ref_count(pfrag->page) == 1) { pfrag->offset = 0; return true; } -- cgit v1.2.3-70-g09d2 From 95813b8faa0cd315f61a8b9d9c523792370b693e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:29 -0700 Subject: mm/page_ref: add tracepoint to track down page reference manipulation CMA allocation should be guaranteed to succeed by definition, but, unfortunately, it would be failed sometimes. It is hard to track down the problem, because it is related to page reference manipulation and we don't have any facility to analyze it. This patch adds tracepoints to track down page reference manipulation. With it, we can find exact reason of failure and can fix the problem. Following is an example of tracepoint output. (note: this example is stale version that printing flags as the number. Recent version will print it as human readable string.) <...>-9018 [004] 92.678375: page_ref_set: pfn=0x17ac9 flags=0x0 count=1 mapcount=0 mapping=(nil) mt=4 val=1 <...>-9018 [004] 92.678378: kernel_stack: => get_page_from_freelist (ffffffff81176659) => __alloc_pages_nodemask (ffffffff81176d22) => alloc_pages_vma (ffffffff811bf675) => handle_mm_fault (ffffffff8119e693) => __do_page_fault (ffffffff810631ea) => trace_do_page_fault (ffffffff81063543) => do_async_page_fault (ffffffff8105c40a) => async_page_fault (ffffffff817581d8) [snip] <...>-9018 [004] 92.678379: page_ref_mod: pfn=0x17ac9 flags=0x40048 count=2 mapcount=1 mapping=0xffff880015a78dc1 mt=4 val=1 [snip] ... ... <...>-9131 [001] 93.174468: test_pages_isolated: start_pfn=0x17800 end_pfn=0x17c00 fin_pfn=0x17ac9 ret=fail [snip] <...>-9018 [004] 93.174843: page_ref_mod_and_test: pfn=0x17ac9 flags=0x40068 count=0 mapcount=0 mapping=0xffff880015a78dc1 mt=4 val=-1 ret=1 => release_pages (ffffffff8117c9e4) => free_pages_and_swap_cache (ffffffff811b0697) => tlb_flush_mmu_free (ffffffff81199616) => tlb_finish_mmu (ffffffff8119a62c) => exit_mmap (ffffffff811a53f7) => mmput (ffffffff81073f47) => do_exit (ffffffff810794e9) => do_group_exit (ffffffff81079def) => SyS_exit_group (ffffffff81079e74) => entry_SYSCALL_64_fastpath (ffffffff817560b6) This output shows that problem comes from exit path. In exit path, to improve performance, pages are not freed immediately. They are gathered and processed by batch. During this process, migration cannot be possible and CMA allocation is failed. This problem is hard to find without this page reference tracepoint facility. Enabling this feature bloat kernel text 30 KB in my configuration. text data bss dec hex filename 12127327 2243616 1507328 15878271 f2487f vmlinux_disabled 12157208 2258880 1507328 15923416 f2f8d8 vmlinux_enabled Note that, due to header file dependency problem between mm.h and tracepoint.h, this feature has to open code the static key functions for tracepoints. Proposed by Steven Rostedt in following link. https://lkml.org/lkml/2015/12/9/699 [arnd@arndb.de: crypto/async_pq: use __free_page() instead of put_page()] [iamjoonsoo.kim@lge.com: fix build failure for xtensa] [akpm@linux-foundation.org: tweak Kconfig text, per Vlastimil] Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Acked-by: Steven Rostedt Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- crypto/async_tx/async_pq.c | 2 +- include/linux/page_ref.h | 98 +++++++++++++++++++++++++++-- include/trace/events/page_ref.h | 134 ++++++++++++++++++++++++++++++++++++++++ mm/Kconfig.debug | 13 ++++ mm/Makefile | 1 + mm/debug_page_ref.c | 54 ++++++++++++++++ 6 files changed, 296 insertions(+), 6 deletions(-) create mode 100644 include/trace/events/page_ref.h create mode 100644 mm/debug_page_ref.c (limited to 'include') diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index c0748bbd4c08..08b3ac68952b 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -444,7 +444,7 @@ static int __init async_pq_init(void) static void __exit async_pq_exit(void) { - put_page(pq_scribble_page); + __free_page(pq_scribble_page); } module_init(async_pq_init); diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 30f5817f6b8e..e596d5d9540e 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -4,6 +4,62 @@ #include #include #include +#include + +extern struct tracepoint __tracepoint_page_ref_set; +extern struct tracepoint __tracepoint_page_ref_mod; +extern struct tracepoint __tracepoint_page_ref_mod_and_test; +extern struct tracepoint __tracepoint_page_ref_mod_and_return; +extern struct tracepoint __tracepoint_page_ref_mod_unless; +extern struct tracepoint __tracepoint_page_ref_freeze; +extern struct tracepoint __tracepoint_page_ref_unfreeze; + +#ifdef CONFIG_DEBUG_PAGE_REF + +/* + * Ideally we would want to use the trace__enabled() helper + * functions. But due to include header file issues, that is not + * feasible. Instead we have to open code the static key functions. + * + * See trace_##name##_enabled(void) in include/linux/tracepoint.h + */ +#define page_ref_tracepoint_active(t) static_key_false(&(t).key) + +extern void __page_ref_set(struct page *page, int v); +extern void __page_ref_mod(struct page *page, int v); +extern void __page_ref_mod_and_test(struct page *page, int v, int ret); +extern void __page_ref_mod_and_return(struct page *page, int v, int ret); +extern void __page_ref_mod_unless(struct page *page, int v, int u); +extern void __page_ref_freeze(struct page *page, int v, int ret); +extern void __page_ref_unfreeze(struct page *page, int v); + +#else + +#define page_ref_tracepoint_active(t) false + +static inline void __page_ref_set(struct page *page, int v) +{ +} +static inline void __page_ref_mod(struct page *page, int v) +{ +} +static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ +} +static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ +} +static inline void __page_ref_mod_unless(struct page *page, int v, int u) +{ +} +static inline void __page_ref_freeze(struct page *page, int v, int ret) +{ +} +static inline void __page_ref_unfreeze(struct page *page, int v) +{ +} + +#endif static inline int page_ref_count(struct page *page) { @@ -18,6 +74,8 @@ static inline int page_count(struct page *page) static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); + if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) + __page_ref_set(page, v); } /* @@ -32,46 +90,74 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { atomic_add(nr, &page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - return atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) + __page_ref_mod_and_test(page, -nr, ret); + return ret; } static inline int page_ref_dec_and_test(struct page *page) { - return atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) + __page_ref_mod_and_test(page, -1, ret); + return ret; } static inline int page_ref_dec_return(struct page *page) { - return atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, -1, ret); + return ret; } static inline int page_ref_add_unless(struct page *page, int nr, int u) { - return atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_count, nr, u); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) + __page_ref_mod_unless(page, nr, ret); + return ret; } static inline int page_ref_freeze(struct page *page, int count) { - return likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) + __page_ref_freeze(page, count, ret); + return ret; } static inline void page_ref_unfreeze(struct page *page, int count) @@ -80,6 +166,8 @@ static inline void page_ref_unfreeze(struct page *page, int count) VM_BUG_ON(count == 0); atomic_set(&page->_count, count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) + __page_ref_unfreeze(page, count); } #endif diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h new file mode 100644 index 000000000000..81001f8b0db4 --- /dev/null +++ b/include/trace/events/page_ref.h @@ -0,0 +1,134 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM page_ref + +#if !defined(_TRACE_PAGE_REF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PAGE_REF_H + +#include +#include +#include +#include + +DECLARE_EVENT_CLASS(page_ref_mod_template, + + TP_PROTO(struct page *page, int v), + + TP_ARGS(page, v), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(unsigned long, flags) + __field(int, count) + __field(int, mapcount) + __field(void *, mapping) + __field(int, mt) + __field(int, val) + ), + + TP_fast_assign( + __entry->pfn = page_to_pfn(page); + __entry->flags = page->flags; + __entry->count = page_ref_count(page); + __entry->mapcount = page_mapcount(page); + __entry->mapping = page->mapping; + __entry->mt = get_pageblock_migratetype(page); + __entry->val = v; + ), + + TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d", + __entry->pfn, + show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)), + __entry->count, + __entry->mapcount, __entry->mapping, __entry->mt, + __entry->val) +); + +DEFINE_EVENT(page_ref_mod_template, page_ref_set, + + TP_PROTO(struct page *page, int v), + + TP_ARGS(page, v) +); + +DEFINE_EVENT(page_ref_mod_template, page_ref_mod, + + TP_PROTO(struct page *page, int v), + + TP_ARGS(page, v) +); + +DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, + + TP_PROTO(struct page *page, int v, int ret), + + TP_ARGS(page, v, ret), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(unsigned long, flags) + __field(int, count) + __field(int, mapcount) + __field(void *, mapping) + __field(int, mt) + __field(int, val) + __field(int, ret) + ), + + TP_fast_assign( + __entry->pfn = page_to_pfn(page); + __entry->flags = page->flags; + __entry->count = page_ref_count(page); + __entry->mapcount = page_mapcount(page); + __entry->mapping = page->mapping; + __entry->mt = get_pageblock_migratetype(page); + __entry->val = v; + __entry->ret = ret; + ), + + TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d val=%d ret=%d", + __entry->pfn, + show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)), + __entry->count, + __entry->mapcount, __entry->mapping, __entry->mt, + __entry->val, __entry->ret) +); + +DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_mod_and_test, + + TP_PROTO(struct page *page, int v, int ret), + + TP_ARGS(page, v, ret) +); + +DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_mod_and_return, + + TP_PROTO(struct page *page, int v, int ret), + + TP_ARGS(page, v, ret) +); + +DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_mod_unless, + + TP_PROTO(struct page *page, int v, int ret), + + TP_ARGS(page, v, ret) +); + +DEFINE_EVENT(page_ref_mod_and_test_template, page_ref_freeze, + + TP_PROTO(struct page *page, int v, int ret), + + TP_ARGS(page, v, ret) +); + +DEFINE_EVENT(page_ref_mod_template, page_ref_unfreeze, + + TP_PROTO(struct page *page, int v), + + TP_ARGS(page, v) +); + +#endif /* _TRACE_PAGE_COUNT_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5c50b238b770..22f4cd96acb0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -79,3 +79,16 @@ config PAGE_POISONING_ZERO Enabling page poisoning with this option will disable hibernation If unsure, say N + bool + +config DEBUG_PAGE_REF + bool "Enable tracepoint to track down page reference manipulation" + depends on DEBUG_KERNEL + depends on TRACEPOINTS + ---help--- + This is a feature to add tracepoint for tracking down page reference + manipulation. This tracking is useful to diagnose functional failure + due to migration failures caused by page reference mismatches. Be + careful when enabling this feature because it adds about 30 KB to the + kernel code. However the runtime performance overhead is virtually + nil until the tracepoints are actually enabled. diff --git a/mm/Makefile b/mm/Makefile index cfdd481d27a5..6da300a1414b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -81,3 +81,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c new file mode 100644 index 000000000000..1aef3d562e52 --- /dev/null +++ b/mm/debug_page_ref.c @@ -0,0 +1,54 @@ +#include +#include + +#define CREATE_TRACE_POINTS +#include + +void __page_ref_set(struct page *page, int v) +{ + trace_page_ref_set(page, v); +} +EXPORT_SYMBOL(__page_ref_set); +EXPORT_TRACEPOINT_SYMBOL(page_ref_set); + +void __page_ref_mod(struct page *page, int v) +{ + trace_page_ref_mod(page, v); +} +EXPORT_SYMBOL(__page_ref_mod); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod); + +void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_test(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_test); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test); + +void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_return(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_return); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return); + +void __page_ref_mod_unless(struct page *page, int v, int u) +{ + trace_page_ref_mod_unless(page, v, u); +} +EXPORT_SYMBOL(__page_ref_mod_unless); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless); + +void __page_ref_freeze(struct page *page, int v, int ret) +{ + trace_page_ref_freeze(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_freeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze); + +void __page_ref_unfreeze(struct page *page, int v) +{ + trace_page_ref_unfreeze(page, v); +} +EXPORT_SYMBOL(__page_ref_unfreeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze); -- cgit v1.2.3-70-g09d2 From b11a7b94100cba5ec926a181894c2897a22651b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Mar 2016 14:19:41 -0700 Subject: mm: exclude ZONE_DEVICE from GFP_ZONE_TABLE ZONE_DEVICE (merged in 4.3) and ZONE_CMA (proposed) are examples of new mm zones that are bumping up against the current maximum limit of 4 zones, i.e. 2 bits in page->flags for the GFP_ZONE_TABLE. The GFP_ZONE_TABLE poses an interesting constraint since include/linux/gfp.h gets included by the 32-bit portion of a 64-bit build. We need to be careful to only build the table for zones that have a corresponding gfp_t flag. GFP_ZONES_SHIFT is introduced for this purpose. This patch does not attempt to solve the problem of adding a new zone that also has a corresponding GFP_ flag. Vlastimil points out that ZONE_DEVICE, by depending on x86_64 and SPARSEMEM_VMEMMAP implies that SECTIONS_WIDTH is zero. In other words even though ZONE_DEVICE does not fit in GFP_ZONE_TABLE it is free to consume another bit in page->flags (expand ZONES_WIDTH) with room to spare. Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931 Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"") Signed-off-by: Dan Williams Reported-by: Mark Reported-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Joonsoo Kim Cc: Dave Hansen Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 33 ++++++++++++++++++++------------- include/linux/page-flags-layout.h | 2 ++ mm/Kconfig | 2 -- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 11d56c6e7ef2..570383a41853 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,22 +331,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * - * ZONES_SHIFT must be <= 2 on 32 bit platforms. + * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ -#if 16 * ZONES_SHIFT > BITS_PER_LONG -#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 +/* ZONE_DEVICE is not a valid GFP zone specifier */ +#define GFP_ZONES_SHIFT 2 +#else +#define GFP_ZONES_SHIFT ZONES_SHIFT +#endif + +#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG +#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ - (ZONE_NORMAL << 0 * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \ - | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \ - | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \ - | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \ - | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \ - | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \ + (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ + | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ + | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* @@ -371,8 +378,8 @@ static inline enum zone_type gfp_zone(gfp_t flags) enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); - z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & - ((1 << ZONES_SHIFT) - 1); + z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & + ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index da523661500a..77b078c103b2 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -17,6 +17,8 @@ #define ZONES_SHIFT 1 #elif MAX_NR_ZONES <= 4 #define ZONES_SHIFT 2 +#elif MAX_NR_ZONES <= 8 +#define ZONES_SHIFT 3 #else #error ZONES_SHIFT -- too many zones configured adjust calculation #endif diff --git a/mm/Kconfig b/mm/Kconfig index c07776503708..520dae6e6ed0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -651,8 +651,6 @@ config IDLE_PAGE_TRACKING config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT - default !ZONE_DMA - depends on !ZONE_DMA depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on X86_64 #arch_add_memory() comprehends device memory -- cgit v1.2.3-70-g09d2 From 0e8fb9312fbaf1a687dd731b04d8ab3121c4ff5a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 17 Mar 2016 14:19:55 -0700 Subject: mm: remove VM_FAULT_MINOR The define has a comment from Nick Piggin from 2007: /* For backwards compat. Remove me quickly. */ I guess 9 years should not be too hurried sense of 'quickly' even for kernel measures. Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/unicore32/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- include/linux/mm.h | 2 -- 5 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index daafcf121ce0..ad5841856007 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -346,7 +346,7 @@ retry: up_read(&mm->mmap_sem); /* - * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR + * Handle the "normal" case first - VM_FAULT_MAJOR */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index abe2a9542b3a..97135b61b32a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -295,7 +295,7 @@ retry: up_read(&mm->mmap_sem); /* - * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR + * Handle the "normal" case first - VM_FAULT_MAJOR */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index afccef5529cc..2ec3d3adcefc 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -276,7 +276,7 @@ retry: up_read(&mm->mmap_sem); /* - * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR + * Handle the "normal" case first - VM_FAULT_MAJOR */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index c9784c1b18d8..7f4a1fdb1502 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -146,7 +146,7 @@ good_area: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); if (flags & VM_FAULT_MAJOR) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); - else if (flags & VM_FAULT_MINOR) + else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); return; diff --git a/include/linux/mm.h b/include/linux/mm.h index 997fc2e5d9d8..7d42501c8bb4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,8 +1052,6 @@ static inline void clear_page_pfmemalloc(struct page *page) * just gets major/minor fault counters bumped up. */ -#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ - #define VM_FAULT_OOM 0x0001 #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 -- cgit v1.2.3-70-g09d2 From b97731992d00f09456726bfc5ab6641c07773038 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:01 -0700 Subject: rmap: introduce rmap_walk_locked() This patchset rewrites freeze_page() and unfreeze_page() using try_to_unmap() and remove_migration_ptes(). Result is much simpler, but somewhat slower. Migration 8GiB worth of PMD-mapped THP: Baseline 20.21 +/- 0.393 Patched 20.73 +/- 0.082 Slowdown 1.03x It's 3% slower, comparing to 14% in v1. I don't it should be a stopper. Splitting of PTE-mapped pages slowed more. But this is not a common case. Migration 8GiB worth of PMD-mapped THP: Baseline 20.39 +/- 0.225 Patched 22.43 +/- 0.496 Slowdown 1.10x rmap_walk_locked() is the same as rmap_walk(), but the caller takes care of the relevant rmap lock. This is preparation for switching THP splitting from custom rmap walk in freeze_page()/unfreeze_page() to the generic one. There is no support for KSM pages for now: not clear which lock is implied. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + mm/rmap.c | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a07f42bedda3..a5875e9b4a27 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -266,6 +266,7 @@ struct rmap_walk_control { }; int rmap_walk(struct page *page, struct rmap_walk_control *rwc); +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/rmap.c b/mm/rmap.c index 02f0bfc3c80a..30b739ce0ffa 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1715,14 +1715,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, + bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page, rwc); + if (locked) { + anon_vma = page_anon_vma(page); + /* anon_vma disappear under us? */ + VM_BUG_ON_PAGE(!anon_vma, page); + } else { + anon_vma = rmap_walk_anon_lock(page, rwc); + } if (!anon_vma) return ret; @@ -1742,7 +1749,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (rwc->done && rwc->done(page)) break; } - anon_vma_unlock_read(anon_vma); + + if (!locked) + anon_vma_unlock_read(anon_vma); return ret; } @@ -1759,9 +1768,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, + bool locked) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1778,7 +1788,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) return ret; pgoff = page_to_pgoff(page); - i_mmap_lock_read(mapping); + if (!locked) + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1795,7 +1806,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) } done: - i_mmap_unlock_read(mapping); + if (!locked) + i_mmap_unlock_read(mapping); return ret; } @@ -1804,9 +1816,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc); + return rmap_walk_anon(page, rwc, false); + else + return rmap_walk_file(page, rwc, false); +} + +/* Like rmap_walk, but caller holds relevant rmap lock */ +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +{ + /* no ksm support for now */ + VM_BUG_ON_PAGE(PageKsm(page), page); + if (PageAnon(page)) + return rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc); + return rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3-70-g09d2 From 2a52bcbcc688eecead2953143f7ef695b8e44575 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:04 -0700 Subject: rmap: extend try_to_unmap() to be usable by split_huge_page() Add support for two ttu_flags: - TTU_SPLIT_HUGE_PMD would split PMD if it's there, before trying to unmap page; - TTU_RMAP_LOCKED indicates that caller holds relevant rmap lock; Also, change rwc->done to !page_mapcount() instead of !page_mapped(). try_to_unmap() works on pte level, so we are really interested in the mappedness of this small page rather than of the compound page it's a part of. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++++ include/linux/rmap.h | 3 +++ mm/huge_memory.c | 5 +---- mm/rmap.c | 24 ++++++++++++++++-------- 4 files changed, 27 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a4cecb4801ec..01ad22e938b0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -106,6 +106,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd(__vma, __pmd, __address); \ } while (0) + +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address); + #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -173,6 +176,10 @@ static inline int split_huge_page(struct page *page) static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) + +static inline void split_huge_pmd_address(struct vm_area_struct *vma, + unsigned long address) {} + static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a5875e9b4a27..3d975e2252d4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -86,6 +86,7 @@ enum ttu_flags { TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ TTU_LZFREE = 8, /* lazy free mode */ + TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -93,6 +94,8 @@ enum ttu_flags { TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ + TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + * caller holds it */ }; #ifdef CONFIG_MMU diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e1a177c20791..11d15674ff38 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3006,15 +3006,12 @@ out: } } -static void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; diff --git a/mm/rmap.c b/mm/rmap.c index 30b739ce0ffa..945933a01010 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,6 +1431,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address); pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1576,10 +1578,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_mapcount_is_zero(struct page *page) { - return !page_mapped(page); -}; + return !page_mapcount(page); +} /** * try_to_unmap - try to remove all page table mappings to a page @@ -1606,12 +1608,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = &rp, - .done = page_not_mapped, + .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; - VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); - /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the @@ -1623,9 +1623,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; - ret = rmap_walk(page, &rwc); + if (flags & TTU_RMAP_LOCKED) + ret = rmap_walk_locked(page, &rwc); + else + ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) { ret = SWAP_SUCCESS; if (rp.lazyfreed && !PageDirty(page)) ret = SWAP_LZFREE; @@ -1633,6 +1636,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked -- cgit v1.2.3-70-g09d2 From e388466de4a2a1a50c43bfaeacc0c8254d9e7cb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:07 -0700 Subject: mm: make remove_migration_ptes() beyond mm/migration.c Make remove_migration_ptes() available to be used in split_huge_page(). New parameter 'locked' added: as with try_to_umap() we need a way to indicate that caller holds rmap lock. We also shouldn't try to mlock() pte-mapped huge pages: pte-mapeed THP pages are never mlocked. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 ++ mm/migrate.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d975e2252d4..49eb4f8ebac9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -243,6 +243,8 @@ int page_mkclean(struct page *); */ int try_to_munlock(struct page *); +void remove_migration_ptes(struct page *old, struct page *new, bool locked); + /* * Called by memory-failure.c to kill processes. */ diff --git a/mm/migrate.c b/mm/migrate.c index 577c94b8e959..6c822a7b27e0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -172,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); /* No need to invalidate - it was non-present before */ @@ -187,14 +187,17 @@ out: * Get rid of all migration entries and replace them by * references to the indicated page. */ -static void remove_migration_ptes(struct page *old, struct page *new) +void remove_migration_ptes(struct page *old, struct page *new, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, }; - rmap_walk(new, &rwc); + if (locked) + rmap_walk_locked(new, &rwc); + else + rmap_walk(new, &rwc); } /* @@ -702,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(page, page); + remove_migration_ptes(page, page, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -900,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (page_was_mapped) remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page); + rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); out_unlock_both: unlock_page(newpage); @@ -1070,7 +1073,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_page(new_hpage); -- cgit v1.2.3-70-g09d2 From fec89c109f3a7737fe3a7bf0f40d1fb7709d353b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:10 -0700 Subject: thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers freeze_page() and unfreeze_page() helpers evolved in rather complex beasts. It would be nice to cut complexity of this code. This patch rewrites freeze_page() using standard try_to_unmap(). unfreeze_page() is rewritten with remove_migration_ptes(). The result is much simpler. But the new variant is somewhat slower for PTE-mapped THPs. Current helpers iterates over VMAs the compound page is mapped to, and then over ptes within this VMA. New helpers iterates over small page, then over VMA the small page mapped to, and only then find relevant pte. We have short cut for PMD-mapped THP: we directly install migration entries on PMD split. I don't think the slowdown is critical, considering how much simpler result is and that split_huge_page() is quite rare nowadays. It only happens due memory pressure or migration. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 10 ++- mm/huge_memory.c | 210 +++++++++--------------------------------------- mm/rmap.c | 10 ++- 3 files changed, 50 insertions(+), 180 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 01ad22e938b0..5307dfb3f8ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -96,18 +96,20 @@ static inline int split_huge_page(struct page *page) void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address); + unsigned long address, bool freeze); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ - __split_huge_pmd(__vma, __pmd, __address); \ + __split_huge_pmd(__vma, __pmd, __address, \ + false); \ } while (0) -void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address); +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page); #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" @@ -178,7 +180,7 @@ static inline void deferred_split_huge_page(struct page *page) {} do { } while (0) static inline void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) {} + unsigned long address, bool freeze, struct page *page) {} static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 11d15674ff38..4a58fa19afac 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2977,7 +2977,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; @@ -2994,7 +2994,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = NULL; } else if (!pmd_devmap(*pmd)) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, false); + __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); @@ -3006,7 +3006,8 @@ out: } } -void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) { pgd_t *pgd; pud_t *pud; @@ -3023,11 +3024,20 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) pmd = pmd_offset(pud, address); if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + return; + /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_pmd(vma, pmd, address); + __split_huge_pmd(vma, pmd, address, freeze); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3043,7 +3053,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start); + split_huge_pmd_address(vma, start, false, NULL); /* * If the new end address isn't hpage aligned and it could @@ -3053,7 +3063,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end); + split_huge_pmd_address(vma, end, false, NULL); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3067,184 +3077,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart); + split_huge_pmd_address(next, nstart, false, NULL); } } -static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static void freeze_page(struct page *page) { - unsigned long haddr = address & HPAGE_PMD_MASK; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - pmd = pmd_offset(pud, address); - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_present(*pmd)) { - spin_unlock(ptl); - return; - } - if (pmd_trans_huge(*pmd)) { - if (page == pmd_page(*pmd)) - __split_huge_pmd_locked(vma, pmd, haddr, true); - spin_unlock(ptl); - return; - } - spin_unlock(ptl); - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - pte_t entry, swp_pte; - swp_entry_t swp_entry; - - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non PMD-aligned address. - */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!pte_present(*pte)) - continue; - if (page_to_pfn(page) != pte_pfn(*pte)) - continue; - flush_cache_page(vma, address, page_to_pfn(page)); - entry = ptep_clear_flush(vma, address, pte); - if (pte_dirty(entry)) - SetPageDirty(page); - swp_entry = make_migration_entry(page, pte_write(entry)); - swp_pte = swp_entry_to_pte(swp_entry); - if (pte_soft_dirty(entry)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(vma->vm_mm, address, pte, swp_pte); - page_remove_rmap(page, false); - put_page(page); - } - pte_unmap_unlock(pte - 1, ptl); -} - -static void freeze_page(struct anon_vma *anon_vma, struct page *page) -{ - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); + enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | + TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, - pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); - - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - freeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } -} - -static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) -{ - spinlock_t *ptl; - pmd_t *pmd; - pte_t *pte, entry; - swp_entry_t swp_entry; - unsigned long haddr = address & HPAGE_PMD_MASK; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non-PMD aligned address. - */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!is_swap_pte(*pte)) - continue; - - swp_entry = pte_to_swp_entry(*pte); - if (!is_migration_entry(swp_entry)) - continue; - if (migration_entry_to_page(swp_entry) != page) - continue; - - get_page(page); - page_add_anon_rmap(page, vma, address, false); - - entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (PageDirty(page)) - entry = pte_mkdirty(entry); - if (is_write_migration_entry(swp_entry)) - entry = maybe_mkwrite(entry, vma); - - flush_dcache_page(page); - set_pte_at(vma->vm_mm, address, pte, entry); + /* We only need TTU_SPLIT_HUGE_PMD once */ + ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); + for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { + /* Cut short if the page is unmapped */ + if (page_count(page) == 1) + return; - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + ret = try_to_unmap(page + i, ttu_flags); } - pte_unmap_unlock(pte - 1, ptl); + VM_BUG_ON(ret); } -static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +static void unfreeze_page(struct page *page) { - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); - - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff, pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); + int i; - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - unfreeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); } static void __split_huge_page_tail(struct page *head, int tail, @@ -3322,7 +3184,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); - unfreeze_page(page_anon_vma(head), head); + unfreeze_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -3418,7 +3280,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(anon_vma, head); + freeze_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -3447,7 +3309,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) BUG(); } else { spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - unfreeze_page(anon_vma, head); + unfreeze_page(head); ret = -EBUSY; } diff --git a/mm/rmap.c b/mm/rmap.c index 945933a01010..c399a0d41b31 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,8 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; - if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address); + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_address(vma, address, + flags & TTU_MIGRATION, page); + /* check if we have anything to do after split */ + if (page_mapcount(page) == 0) + goto out; + } + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; -- cgit v1.2.3-70-g09d2 From da8b44d5a9f8bf26da637b7336508ca534d6b319 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: timer: convert timer_slack_ns from unsigned long to u64 This patchset introduces a /proc//timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc//timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently a unsigned long. This means that on 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, its much much larger (~500 years). This disparity could make application development a little (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range. Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- fs/select.c | 8 ++++---- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 9 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cde60741cad2..8a74a2a52e0f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; diff --git a/fs/select.c b/fs/select.c index 79d0d4953cad..869293988c2a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. */ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2ead22dd74a0..c98c6539e2c2 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -220,7 +220,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -378,7 +378,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -399,7 +399,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -477,10 +477,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index eb7f2f84009b..3284d07edec7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1792,8 +1792,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..cf8ba545c7d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..58a321c34cfb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -979,7 +979,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1548,7 +1548,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1724,7 +1724,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1792,7 +1792,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } -- cgit v1.2.3-70-g09d2 From dfbf2897d00499f94cd53a9806e697392cbfe6a3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Mar 2016 14:21:12 -0700 Subject: bug: set warn variable before calling WARN() This has hit me a couple of times already. I would be debugging code and the system would simply hang and then reboot. Finally, I found that the problem was caused by WARN_ON_ONCE() and friends. The macro WARN_ON_ONCE(condition) is defined as: static bool __section(.data.unlikely) __warned; int __ret_warn_once = !!(condition); if (unlikely(__ret_warn_once)) if (WARN_ON(!__warned)) __warned = true; unlikely(__ret_warn_once); Which looks great and all. But what I have hit, is an issue when WARN_ON() itself hits the same WARN_ON_ONCE() code. Because, the variable __warned is not yet set. Then it too calls WARN_ON() and that triggers the warning again. It keeps doing this until the stack is overflowed and the system crashes. By setting __warned first before calling WARN_ON() makes the original WARN_ON_ONCE() really only warn once, and not an infinite amount of times if the WARN_ON() also triggers the warning. Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 630dd2372238..fdf6fa078422 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -110,9 +110,10 @@ extern void warn_slowpath_null(const char *file, const int line); static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ \ - if (unlikely(__ret_warn_once)) \ - if (WARN_ON(!__warned)) \ - __warned = true; \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + WARN_ON(1); \ + } \ unlikely(__ret_warn_once); \ }) @@ -120,9 +121,10 @@ extern void warn_slowpath_null(const char *file, const int line); static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ \ - if (unlikely(__ret_warn_once)) \ - if (WARN(!__warned, format)) \ - __warned = true; \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + WARN(1, format); \ + } \ unlikely(__ret_warn_once); \ }) @@ -130,9 +132,10 @@ extern void warn_slowpath_null(const char *file, const int line); static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ \ - if (unlikely(__ret_warn_once)) \ - if (WARN_TAINT(!__warned, taint, format)) \ - __warned = true; \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + WARN_TAINT(1, taint, format); \ + } \ unlikely(__ret_warn_once); \ }) -- cgit v1.2.3-70-g09d2 From 93e205a728e6cb8d7d11f6836e289798a1de25e2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 17 Mar 2016 14:21:15 -0700 Subject: fix Christoph's email addresses There are various email addresses for me throughout the kernel. Use the one that will always be valid. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v1/cgroups.txt | 2 +- Documentation/cgroup-v1/cpusets.txt | 2 +- MAINTAINERS | 2 +- arch/ia64/include/asm/rwsem.h | 2 +- include/linux/quicklist.h | 2 +- mm/mmu_notifier.c | 2 +- mm/quicklist.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt index c6256ae9885b..947e6fe31ef9 100644 --- a/Documentation/cgroup-v1/cgroups.txt +++ b/Documentation/cgroup-v1/cgroups.txt @@ -8,7 +8,7 @@ Original copyright statements from cpusets.txt: Portions Copyright (C) 2004 BULL SA. Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. Modified by Paul Jackson -Modified by Christoph Lameter +Modified by Christoph Lameter CONTENTS: ========= diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt index fdf7dff3f607..e5cdcd445615 100644 --- a/Documentation/cgroup-v1/cpusets.txt +++ b/Documentation/cgroup-v1/cpusets.txt @@ -6,7 +6,7 @@ Written by Simon.Derr@bull.net Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. Modified by Paul Jackson -Modified by Christoph Lameter +Modified by Christoph Lameter Modified by Paul Menage Modified by Hidetoshi Seto diff --git a/MAINTAINERS b/MAINTAINERS index 99bd725affc6..67d34bb7335c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8473,7 +8473,7 @@ F: include/crypto/pcrypt.h PER-CPU MEMORY ALLOCATOR M: Tejun Heo -M: Christoph Lameter +M: Christoph Lameter T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git S: Maintained F: include/linux/percpu*.h diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 3027e7516d85..ce112472bdd6 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -3,7 +3,7 @@ * * Copyright (C) 2003 Ken Chen * Copyright (C) 2003 Asit Mallick - * Copyright (C) 2005 Christoph Lameter + * Copyright (C) 2005 Christoph Lameter * * Based on asm-i386/rwsem.h and other architecture implementation. * diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h index bd466439c588..3bdfa70bc642 100644 --- a/include/linux/quicklist.h +++ b/include/linux/quicklist.h @@ -5,7 +5,7 @@ * as needed after allocation when they are freed. Per cpu lists of pages * are kept that only contain node local pages. * - * (C) 2007, SGI. Christoph Lameter + * (C) 2007, SGI. Christoph Lameter */ #include #include diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5fbdd367bbed..f4259e496f83 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter + * Christoph Lameter * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/mm/quicklist.c b/mm/quicklist.c index 942212970529..daf6ff6e199a 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -8,7 +8,7 @@ * improved on it. * * Copyright (C) 2007 SGI, - * Christoph Lameter + * Christoph Lameter * Generalized, added support for multiple lists and * constructors / destructors. */ -- cgit v1.2.3-70-g09d2 From faeb50b98a337abf7a11375e06a83cda23c8ca67 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Thu, 17 Mar 2016 14:21:17 -0700 Subject: include/uapi/linux/elf-em.h: remove v850 The v850 port was removed by commits f606ddf42fd4 and 07a887d399b8 in 2008. These #defines are not used in the current kernel. Signed-off-by: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/elf-em.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index b56dfcfe922a..c3fdfe79e5cc 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -30,7 +30,6 @@ #define EM_X86_64 62 /* AMD x86-64 */ #define EM_S390 22 /* IBM S/390 */ #define EM_CRIS 76 /* Axis Communications 32-bit embedded processor */ -#define EM_V850 87 /* NEC v850 */ #define EM_M32R 88 /* Renesas M32R */ #define EM_MN10300 89 /* Panasonic/MEI MN10300, AM33 */ #define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */ @@ -50,8 +49,6 @@ */ #define EM_ALPHA 0x9026 -/* Bogus old v850 magic number, used by old tools. */ -#define EM_CYGNUS_V850 0x9080 /* Bogus old m32r magic number, used by old tools. */ #define EM_CYGNUS_M32R 0x9041 /* This is the old interim value for S/390 architecture */ -- cgit v1.2.3-70-g09d2 From 26a247fd9f8d2df1965b7f22c9be2fb3a48603f3 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 17 Mar 2016 14:21:35 -0700 Subject: include/linux/list_bl.h: use bool instead of int for boolean functions hlist_bl_unhashed() and hlist_bl_empty() are all boolean functions, so return bool instead of int. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_bl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index ee7229a6c06a..cb483305e1f5 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -48,7 +48,7 @@ static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) -static inline int hlist_bl_unhashed(const struct hlist_bl_node *h) +static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } @@ -68,7 +68,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } -static inline int hlist_bl_empty(const struct hlist_bl_head *h) +static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } -- cgit v1.2.3-70-g09d2 From f67c07f07fca95a7f330b8bb928eabaf9fcce75d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:21:42 -0700 Subject: radix-tree: add an explicit include of bitops.h The radix-tree header uses the __ffs() function, which is defined in bitops.h. The current kernel headers implicitly include bitops.h, but the userspace test harness does not. Signed-off-by: Matthew Wilcox Cc: Johannes Weiner Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Ross Zwisler Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index f54be7082207..39598b9cf1d9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -21,6 +21,7 @@ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H +#include #include #include #include -- cgit v1.2.3-70-g09d2 From e61452365372570253b2b1de84bab0cdb2e62c64 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:21:54 -0700 Subject: radix_tree: add support for multi-order entries With huge pages, it is convenient to have the radix tree be able to return an entry that covers multiple indices. Previous attempts to deal with the problem have involved inserting N duplicate entries, which is a waste of memory and leads to problems trying to handle aliased tags, or probing the tree multiple times to find alternative entries which might cover the requested index. This approach inserts one canonical entry into the tree for a given range of indices, and may also insert other entries in order to ensure that lookups find the canonical entry. This solution only tolerates inserting powers of two that are greater than the fanout of the tree. If we wish to expand the radix tree's abilities to support large-ish pages that is less than the fanout at the penultimate level of the tree, then we would need to add one more step in lookup to ensure that any sibling nodes in the final level of the tree are dereferenced and we return the canonical entry that they reference. Signed-off-by: Matthew Wilcox Cc: Johannes Weiner Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Ross Zwisler Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 11 ++++- lib/radix-tree.c | 109 ++++++++++++++++++++++++++++++++++----------- mm/filemap.c | 2 +- 3 files changed, 93 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 39598b9cf1d9..b211f145c811 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -271,8 +271,15 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) } int __radix_tree_create(struct radix_tree_root *root, unsigned long index, - struct radix_tree_node **nodep, void ***slotp); -int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); + unsigned order, struct radix_tree_node **nodep, + void ***slotp); +int __radix_tree_insert(struct radix_tree_root *, unsigned long index, + unsigned order, void *); +static inline int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *entry) +{ + return __radix_tree_insert(root, index, 0, entry); +} void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9bb440f317c9..d907dca302d5 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -333,7 +333,8 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) /* * Extend a radix tree so it can store key @index. */ -static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) +static int radix_tree_extend(struct radix_tree_root *root, + unsigned long index, unsigned order) { struct radix_tree_node *node; struct radix_tree_node *slot; @@ -345,7 +346,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) while (index > radix_tree_maxindex(height)) height++; - if (root->rnode == NULL) { + if ((root->rnode == NULL) && (order == 0)) { root->height = height; goto out; } @@ -386,6 +387,7 @@ out: * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key + * @order: index occupies 2^order aligned slots * @nodep: returns node * @slotp: returns slot * @@ -399,26 +401,29 @@ out: * Returns -ENOMEM, or 0 for success. */ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, - struct radix_tree_node **nodep, void ***slotp) + unsigned order, struct radix_tree_node **nodep, + void ***slotp) { struct radix_tree_node *node = NULL, *slot; unsigned int height, shift, offset; int error; + BUG_ON((0 < order) && (order < RADIX_TREE_MAP_SHIFT)); + /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { - error = radix_tree_extend(root, index); + error = radix_tree_extend(root, index, order); if (error) return error; } - slot = indirect_to_ptr(root->rnode); + slot = root->rnode; height = root->height; shift = height * RADIX_TREE_MAP_SHIFT; offset = 0; /* uninitialised var warning */ - while (shift > 0) { + while (shift > order) { if (slot == NULL) { /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) @@ -433,15 +438,31 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); - } + } else if (!radix_tree_is_indirect_ptr(slot)) + break; /* Go a level down */ + height--; shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; + node = indirect_to_ptr(slot); slot = node->slots[offset]; - slot = indirect_to_ptr(slot); - height--; + } + + /* Insert pointers to the canonical entry */ + if ((shift - order) > 0) { + int i, n = 1 << (shift - order); + offset = offset & ~(n - 1); + slot = ptr_to_indirect(&node->slots[offset]); + for (i = 0; i < n; i++) { + if (node->slots[offset + i]) + return -EEXIST; + } + + for (i = 1; i < n; i++) { + rcu_assign_pointer(node->slots[offset + i], slot); + node->count++; + } } if (nodep) @@ -452,15 +473,16 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, } /** - * radix_tree_insert - insert into a radix tree + * __radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key + * @order: key covers the 2^order indices around index * @item: item to insert * * Insert an item into the radix tree at position @index. */ -int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *item) +int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, + unsigned order, void *item) { struct radix_tree_node *node; void **slot; @@ -468,7 +490,7 @@ int radix_tree_insert(struct radix_tree_root *root, BUG_ON(radix_tree_is_indirect_ptr(item)); - error = __radix_tree_create(root, index, &node, &slot); + error = __radix_tree_create(root, index, order, &node, &slot); if (error) return error; if (*slot != NULL) @@ -486,7 +508,7 @@ int radix_tree_insert(struct radix_tree_root *root, return 0; } -EXPORT_SYMBOL(radix_tree_insert); +EXPORT_SYMBOL(__radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree @@ -537,6 +559,8 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, node = rcu_dereference_raw(*slot); if (node == NULL) return NULL; + if (!radix_tree_is_indirect_ptr(node)) + break; node = indirect_to_ptr(node); shift -= RADIX_TREE_MAP_SHIFT; @@ -624,6 +648,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root, tag_set(slot, tag, offset); slot = slot->slots[offset]; BUG_ON(slot == NULL); + if (!radix_tree_is_indirect_ptr(slot)) + break; slot = indirect_to_ptr(slot); shift -= RADIX_TREE_MAP_SHIFT; height--; @@ -669,6 +695,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, while (shift) { if (slot == NULL) goto out; + if (!radix_tree_is_indirect_ptr(slot)) + break; slot = indirect_to_ptr(slot); shift -= RADIX_TREE_MAP_SHIFT; @@ -753,6 +781,8 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (height == 1) return 1; node = rcu_dereference_raw(node->slots[offset]); + if (!radix_tree_is_indirect_ptr(node)) + return 1; shift -= RADIX_TREE_MAP_SHIFT; height--; } @@ -813,6 +843,7 @@ restart: node = rnode; while (1) { + struct radix_tree_node *slot; if ((flags & RADIX_TREE_ITER_TAGGED) ? !test_bit(offset, node->tags[tag]) : !node->slots[offset]) { @@ -843,10 +874,12 @@ restart: if (!shift) break; - node = rcu_dereference_raw(node->slots[offset]); - if (node == NULL) + slot = rcu_dereference_raw(node->slots[offset]); + if (slot == NULL) goto restart; - node = indirect_to_ptr(node); + if (!radix_tree_is_indirect_ptr(slot)) + break; + node = indirect_to_ptr(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } @@ -944,16 +977,20 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, if (!tag_get(slot, iftag, offset)) goto next; if (shift) { - /* Go down one level */ - shift -= RADIX_TREE_MAP_SHIFT; node = slot; slot = slot->slots[offset]; - slot = indirect_to_ptr(slot); - continue; + if (radix_tree_is_indirect_ptr(slot)) { + slot = indirect_to_ptr(slot); + shift -= RADIX_TREE_MAP_SHIFT; + continue; + } else { + slot = node; + node = node->parent; + } } /* tag the leaf */ - tagged++; + tagged += 1 << shift; tag_set(slot, settag, offset); /* walk back up the path tagging interior nodes */ @@ -1201,11 +1238,20 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, goto out; } - shift -= RADIX_TREE_MAP_SHIFT; slot = rcu_dereference_raw(slot->slots[i]); if (slot == NULL) goto out; + if (!radix_tree_is_indirect_ptr(slot)) { + if (slot == item) { + *found_index = index + i; + index = 0; + } else { + index += shift; + } + goto out; + } slot = indirect_to_ptr(slot); + shift -= RADIX_TREE_MAP_SHIFT; } /* Bottom level: check items */ @@ -1285,7 +1331,8 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, we cannot shrink. + * is not at the leftmost slot, or it is a multiorder entry, + * we cannot shrink. */ if (to_free->count != 1) break; @@ -1301,6 +1348,9 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) * one (root->rnode) as far as dependent read barriers go. */ if (root->height > 1) { + if (!radix_tree_is_indirect_ptr(slot)) + break; + slot = indirect_to_ptr(slot); slot->parent = NULL; slot = ptr_to_indirect(slot); @@ -1399,7 +1449,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; - unsigned int offset; + unsigned int offset, i; void **slot; void *entry; int tag; @@ -1428,6 +1478,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root, radix_tree_tag_clear(root, index, tag); } + /* Delete any sibling slots pointing to this slot */ + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr_to_indirect(slot)) + break; + node->slots[offset + i] = NULL; + node->count--; + } node->slots[offset] = NULL; node->count--; diff --git a/mm/filemap.c b/mm/filemap.c index 61b441b191ad..084ad0fe73c7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -586,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, + error = __radix_tree_create(&mapping->page_tree, page->index, 0, &node, &slot); if (error) return error; -- cgit v1.2.3-70-g09d2 From 7165092fe5ca70bae722ac6cd78421cfd0eec18d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:22:06 -0700 Subject: radix-tree,shmem: introduce radix_tree_iter_next() shmem likes to occasionally drop the lock, schedule, then reacqire the lock and continue with the iteration from the last place it left off. This is currently done with a pretty ugly goto. Introduce radix_tree_iter_next() and use it throughout shmem.c. [koct9i@gmail.com: fix bug in radix_tree_iter_next() for tagged iteration] Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ mm/shmem.c | 12 +++--------- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b211f145c811..51a97ac8bfbf 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -402,6 +402,22 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) return NULL; } +/** + * radix_tree_iter_next - resume iterating when the chunk may be invalid + * @iter: iterator state + * + * If the iterator needs to release then reacquire a lock, the chunk may + * have been invalidated by an insertion or deletion. Call this function + * to continue the iteration from the next index. + */ +static inline __must_check +void **radix_tree_iter_next(struct radix_tree_iter *iter) +{ + iter->next_index = iter->index + 1; + iter->tags = 0; + return NULL; +} + /** * radix_tree_chunk_size - get current chunk size * diff --git a/mm/shmem.c b/mm/shmem.c index 91c0dadf48d3..9428c51ab2d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -376,7 +376,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { if (iter.index >= end) break; @@ -393,8 +392,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } @@ -1944,7 +1942,6 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { @@ -1961,8 +1958,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -1999,7 +1995,6 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, SHMEM_TAG_PINNED) { @@ -2033,8 +2028,7 @@ restart: continue_resched: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 56b060814e2d87d6646a85a2f4609c73587399ca Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Mar 2016 14:22:14 -0700 Subject: lib/string: introduce match_string() helper Occasionally we have to search for an occurrence of a string in an array of strings. Make a simple helper for that purpose. Signed-off-by: Andy Shevchenko Cc: "David S. Miller" Cc: Bartlomiej Zolnierkiewicz Cc: David Airlie Cc: David Woodhouse Cc: Dmitry Eremin-Solenikov Cc: Greg Kroah-Hartman Cc: Heikki Krogerus Cc: Linus Walleij Cc: Mika Westerberg Cc: Rafael J. Wysocki Cc: Sebastian Reichel Cc: Tejun Heo Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 2 ++ lib/string.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 9eebc66d957a..0f235e80d355 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -130,6 +130,8 @@ extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); extern int strtobool(const char *s, bool *res); +int match_string(const char * const *array, size_t n, const char *string); + #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); diff --git a/lib/string.c b/lib/string.c index 0323c0d5629a..e9c9db161d2c 100644 --- a/lib/string.c +++ b/lib/string.c @@ -630,6 +630,32 @@ bool sysfs_streq(const char *s1, const char *s2) } EXPORT_SYMBOL(sysfs_streq); +/** + * match_string - matches given string in an array + * @array: array of strings + * @n: number of strings in the array or -1 for NULL terminated arrays + * @string: string to match with + * + * Return: + * index of a @string in the @array if matches, or %-EINVAL otherwise. + */ +int match_string(const char * const *array, size_t n, const char *string) +{ + int index; + const char *item; + + for (index = 0; index < n; index++) { + item = array[index]; + if (!item) + break; + if (!strcmp(item, string)) + return index; + } + + return -EINVAL; +} +EXPORT_SYMBOL(match_string); + /** * strtobool - convert common user inputs into boolean values * @s: input string -- cgit v1.2.3-70-g09d2 From a644fdf029f44f1ff76659154b411afeaeee1f43 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:22:41 -0700 Subject: include/asm-generic/atomic-long.h: force inlining of some atomic_long operations Sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, atomic_long_inc(), atomic_long_dec() and atomic_long_add() functions get deinlined about 40 times. Examples of disassembly: (21 copies, 147 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 ff 07 lock incq (%rdi) 5d pop %rbp c3 retq (4 copies, 14 calls) is similar to inc. (11 copies, 41 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 01 3e lock add %rdi,(%rsi) 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. Code size decrease after the patch is ~1.3k: text data bss dec hex filename 92203657 20826112 36417536 149447305 8e86289 vmlinux 92202377 20826112 36417536 149446025 8e85d89 vmlinux4_atomiclong_after Signed-off-by: Denys Vlasenko Cc: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/atomic-long.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h index eb1973bad80b..5e1f345b58dd 100644 --- a/include/asm-generic/atomic-long.h +++ b/include/asm-generic/atomic-long.h @@ -98,14 +98,14 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release) #define atomic_long_xchg(v, new) \ (ATOMIC_LONG_PFX(_xchg)((ATOMIC_LONG_PFX(_t) *)(v), (new))) -static inline void atomic_long_inc(atomic_long_t *l) +static __always_inline void atomic_long_inc(atomic_long_t *l) { ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l; ATOMIC_LONG_PFX(_inc)(v); } -static inline void atomic_long_dec(atomic_long_t *l) +static __always_inline void atomic_long_dec(atomic_long_t *l) { ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l; @@ -113,7 +113,7 @@ static inline void atomic_long_dec(atomic_long_t *l) } #define ATOMIC_LONG_OP(op) \ -static inline void \ +static __always_inline void \ atomic_long_##op(long i, atomic_long_t *l) \ { \ ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l; \ -- cgit v1.2.3-70-g09d2 From bc27fb68aaad44dd8f5c34924f05721f0abaeec1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:22:44 -0700 Subject: include/uapi/linux/byteorder, swab: force inlining of some byteswap operations Sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, the following functions get deinlined many times. Examples of disassembly: (12 copies, 51 calls): 66 8b 07 mov (%rdi),%ax 55 push %rbp 48 89 e5 mov %rsp,%rbp 86 e0 xchg %ah,%al 5d pop %rbp c3 retq (12 copies, 135 calls): 8b 07 mov (%rdi),%eax 55 push %rbp 48 89 e5 mov %rsp,%rbp 0f c8 bswap %eax 5d pop %rbp c3 retq (2 copies, 20 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 0f c8 bswap %rax 5d pop %rbp c3 retq <__swab16p> (16 copies, 146 calls): 55 push %rbp 89 f8 mov %edi,%eax 86 e0 xchg %ah,%al 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq <__swab32p> (43 copies, ~560 calls): 55 push %rbp 89 f8 mov %edi,%eax 0f c8 bswap %eax 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq <__swab64p> (21 copies, 119 calls): 55 push %rbp 48 89 f8 mov %rdi,%rax 48 0f c8 bswap %rax 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq <__swab32s> (6 copies, 47 calls): 8b 07 mov (%rdi),%eax 55 push %rbp 48 89 e5 mov %rsp,%rbp 0f c8 bswap %eax 89 07 mov %eax,(%rdi) 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. Code size decrease after the patch is ~4.5k: text data bss dec hex filename 92202377 20826112 36417536 149446025 8e85d89 vmlinux 92197848 20826112 36417536 149441496 8e84bd8 vmlinux5_swap_after Signed-off-by: Denys Vlasenko Acked-by: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/byteorder/big_endian.h | 24 ++++++++++++------------ include/uapi/linux/byteorder/little_endian.h | 24 ++++++++++++------------ include/uapi/linux/swab.h | 10 +++++----- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h index 672374450095..cdab17ab907c 100644 --- a/include/uapi/linux/byteorder/big_endian.h +++ b/include/uapi/linux/byteorder/big_endian.h @@ -40,51 +40,51 @@ #define __cpu_to_be16(x) ((__force __be16)(__u16)(x)) #define __be16_to_cpu(x) ((__force __u16)(__be16)(x)) -static inline __le64 __cpu_to_le64p(const __u64 *p) +static __always_inline __le64 __cpu_to_le64p(const __u64 *p) { return (__force __le64)__swab64p(p); } -static inline __u64 __le64_to_cpup(const __le64 *p) +static __always_inline __u64 __le64_to_cpup(const __le64 *p) { return __swab64p((__u64 *)p); } -static inline __le32 __cpu_to_le32p(const __u32 *p) +static __always_inline __le32 __cpu_to_le32p(const __u32 *p) { return (__force __le32)__swab32p(p); } -static inline __u32 __le32_to_cpup(const __le32 *p) +static __always_inline __u32 __le32_to_cpup(const __le32 *p) { return __swab32p((__u32 *)p); } -static inline __le16 __cpu_to_le16p(const __u16 *p) +static __always_inline __le16 __cpu_to_le16p(const __u16 *p) { return (__force __le16)__swab16p(p); } -static inline __u16 __le16_to_cpup(const __le16 *p) +static __always_inline __u16 __le16_to_cpup(const __le16 *p) { return __swab16p((__u16 *)p); } -static inline __be64 __cpu_to_be64p(const __u64 *p) +static __always_inline __be64 __cpu_to_be64p(const __u64 *p) { return (__force __be64)*p; } -static inline __u64 __be64_to_cpup(const __be64 *p) +static __always_inline __u64 __be64_to_cpup(const __be64 *p) { return (__force __u64)*p; } -static inline __be32 __cpu_to_be32p(const __u32 *p) +static __always_inline __be32 __cpu_to_be32p(const __u32 *p) { return (__force __be32)*p; } -static inline __u32 __be32_to_cpup(const __be32 *p) +static __always_inline __u32 __be32_to_cpup(const __be32 *p) { return (__force __u32)*p; } -static inline __be16 __cpu_to_be16p(const __u16 *p) +static __always_inline __be16 __cpu_to_be16p(const __u16 *p) { return (__force __be16)*p; } -static inline __u16 __be16_to_cpup(const __be16 *p) +static __always_inline __u16 __be16_to_cpup(const __be16 *p) { return (__force __u16)*p; } diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h index d876736a0017..4b93f2b260dd 100644 --- a/include/uapi/linux/byteorder/little_endian.h +++ b/include/uapi/linux/byteorder/little_endian.h @@ -40,51 +40,51 @@ #define __cpu_to_be16(x) ((__force __be16)__swab16((x))) #define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x)) -static inline __le64 __cpu_to_le64p(const __u64 *p) +static __always_inline __le64 __cpu_to_le64p(const __u64 *p) { return (__force __le64)*p; } -static inline __u64 __le64_to_cpup(const __le64 *p) +static __always_inline __u64 __le64_to_cpup(const __le64 *p) { return (__force __u64)*p; } -static inline __le32 __cpu_to_le32p(const __u32 *p) +static __always_inline __le32 __cpu_to_le32p(const __u32 *p) { return (__force __le32)*p; } -static inline __u32 __le32_to_cpup(const __le32 *p) +static __always_inline __u32 __le32_to_cpup(const __le32 *p) { return (__force __u32)*p; } -static inline __le16 __cpu_to_le16p(const __u16 *p) +static __always_inline __le16 __cpu_to_le16p(const __u16 *p) { return (__force __le16)*p; } -static inline __u16 __le16_to_cpup(const __le16 *p) +static __always_inline __u16 __le16_to_cpup(const __le16 *p) { return (__force __u16)*p; } -static inline __be64 __cpu_to_be64p(const __u64 *p) +static __always_inline __be64 __cpu_to_be64p(const __u64 *p) { return (__force __be64)__swab64p(p); } -static inline __u64 __be64_to_cpup(const __be64 *p) +static __always_inline __u64 __be64_to_cpup(const __be64 *p) { return __swab64p((__u64 *)p); } -static inline __be32 __cpu_to_be32p(const __u32 *p) +static __always_inline __be32 __cpu_to_be32p(const __u32 *p) { return (__force __be32)__swab32p(p); } -static inline __u32 __be32_to_cpup(const __be32 *p) +static __always_inline __u32 __be32_to_cpup(const __be32 *p) { return __swab32p((__u32 *)p); } -static inline __be16 __cpu_to_be16p(const __u16 *p) +static __always_inline __be16 __cpu_to_be16p(const __u16 *p) { return (__force __be16)__swab16p(p); } -static inline __u16 __be16_to_cpup(const __be16 *p) +static __always_inline __u16 __be16_to_cpup(const __be16 *p) { return __swab16p((__u16 *)p); } diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index 0e011eb91b5d..3f10e5317b46 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -151,7 +151,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) * __swab16p - return a byteswapped 16-bit value from a pointer * @p: pointer to a naturally-aligned 16-bit value */ -static inline __u16 __swab16p(const __u16 *p) +static __always_inline __u16 __swab16p(const __u16 *p) { #ifdef __arch_swab16p return __arch_swab16p(p); @@ -164,7 +164,7 @@ static inline __u16 __swab16p(const __u16 *p) * __swab32p - return a byteswapped 32-bit value from a pointer * @p: pointer to a naturally-aligned 32-bit value */ -static inline __u32 __swab32p(const __u32 *p) +static __always_inline __u32 __swab32p(const __u32 *p) { #ifdef __arch_swab32p return __arch_swab32p(p); @@ -177,7 +177,7 @@ static inline __u32 __swab32p(const __u32 *p) * __swab64p - return a byteswapped 64-bit value from a pointer * @p: pointer to a naturally-aligned 64-bit value */ -static inline __u64 __swab64p(const __u64 *p) +static __always_inline __u64 __swab64p(const __u64 *p) { #ifdef __arch_swab64p return __arch_swab64p(p); @@ -232,7 +232,7 @@ static inline void __swab16s(__u16 *p) * __swab32s - byteswap a 32-bit value in-place * @p: pointer to a naturally-aligned 32-bit value */ -static inline void __swab32s(__u32 *p) +static __always_inline void __swab32s(__u32 *p) { #ifdef __arch_swab32s __arch_swab32s(p); @@ -245,7 +245,7 @@ static inline void __swab32s(__u32 *p) * __swab64s - byteswap a 64-bit value in-place * @p: pointer to a naturally-aligned 64-bit value */ -static inline void __swab64s(__u64 *p) +static __always_inline void __swab64s(__u64 *p) { #ifdef __arch_swab64s __arch_swab64s(p); -- cgit v1.2.3-70-g09d2 From e3bde9568d992c5f985e6e30731a5f9f9bef7b13 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:22:47 -0700 Subject: include/linux/unaligned: force inlining of byteswap operations Sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, the following functions get deinlined many times. Examples of disassembly: (24 copies, 108 calls): 66 8b 07 mov (%rdi),%ax 55 push %rbp 48 89 e5 mov %rsp,%rbp 86 e0 xchg %ah,%al 5d pop %rbp c3 retq (25 copies, 181 calls): 8b 07 mov (%rdi),%eax 55 push %rbp 48 89 e5 mov %rsp,%rbp 0f c8 bswap %eax 5d pop %rbp c3 retq (23 copies, 94 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 0f c8 bswap %rax 5d pop %rbp c3 retq (2 copies, 11 calls): 89 f8 mov %edi,%eax 55 push %rbp c1 ef 08 shr $0x8,%edi c1 e0 08 shl $0x8,%eax 09 c7 or %eax,%edi 48 89 e5 mov %rsp,%rbp 66 89 3e mov %di,(%rsi) (8 copies, 43 calls): 55 push %rbp 0f cf bswap %edi 89 3e mov %edi,(%rsi) 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq (26 copies, 157 calls): 55 push %rbp 48 0f cf bswap %rdi 48 89 3e mov %rdi,(%rsi) 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. It only affects arches with efficient unaligned access insns, such as x86. (arched which lack such ops do not include linux/unaligned/access_ok.h) Code size decrease after the patch is ~8.5k: text data bss dec hex filename 92197848 20826112 36417536 149441496 8e84bd8 vmlinux 92189231 20826144 36417536 149432911 8e82a4f vmlinux6_unaligned_be_after Signed-off-by: Denys Vlasenko Acked-by: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/unaligned/access_ok.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h index 99c1b4d20b0f..33383ca23837 100644 --- a/include/linux/unaligned/access_ok.h +++ b/include/linux/unaligned/access_ok.h @@ -4,62 +4,62 @@ #include #include -static inline u16 get_unaligned_le16(const void *p) +static __always_inline u16 get_unaligned_le16(const void *p) { return le16_to_cpup((__le16 *)p); } -static inline u32 get_unaligned_le32(const void *p) +static __always_inline u32 get_unaligned_le32(const void *p) { return le32_to_cpup((__le32 *)p); } -static inline u64 get_unaligned_le64(const void *p) +static __always_inline u64 get_unaligned_le64(const void *p) { return le64_to_cpup((__le64 *)p); } -static inline u16 get_unaligned_be16(const void *p) +static __always_inline u16 get_unaligned_be16(const void *p) { return be16_to_cpup((__be16 *)p); } -static inline u32 get_unaligned_be32(const void *p) +static __always_inline u32 get_unaligned_be32(const void *p) { return be32_to_cpup((__be32 *)p); } -static inline u64 get_unaligned_be64(const void *p) +static __always_inline u64 get_unaligned_be64(const void *p) { return be64_to_cpup((__be64 *)p); } -static inline void put_unaligned_le16(u16 val, void *p) +static __always_inline void put_unaligned_le16(u16 val, void *p) { *((__le16 *)p) = cpu_to_le16(val); } -static inline void put_unaligned_le32(u32 val, void *p) +static __always_inline void put_unaligned_le32(u32 val, void *p) { *((__le32 *)p) = cpu_to_le32(val); } -static inline void put_unaligned_le64(u64 val, void *p) +static __always_inline void put_unaligned_le64(u64 val, void *p) { *((__le64 *)p) = cpu_to_le64(val); } -static inline void put_unaligned_be16(u16 val, void *p) +static __always_inline void put_unaligned_be16(u16 val, void *p) { *((__be16 *)p) = cpu_to_be16(val); } -static inline void put_unaligned_be32(u32 val, void *p) +static __always_inline void put_unaligned_be32(u32 val, void *p) { *((__be32 *)p) = cpu_to_be32(val); } -static inline void put_unaligned_be64(u64 val, void *p) +static __always_inline void put_unaligned_be64(u64 val, void *p) { *((__be64 *)p) = cpu_to_be64(val); } -- cgit v1.2.3-70-g09d2 From ef951599074ba4fad2d0efa0a977129b41e6d203 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:22:50 -0700 Subject: lib: move strtobool() to kstrtobool() Create the kstrtobool_from_user() helper and move strtobool() logic into the new kstrtobool() (matching all the other kstrto* functions). Provides an inline wrapper for existing strtobool() callers. Signed-off-by: Kees Cook Cc: Joe Perches Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Daniel Borkmann Cc: Amitkumar Karwar Cc: Nishant Sarmukadam Cc: Kalle Valo Cc: Steve French Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ include/linux/string.h | 6 +++++- lib/kstrtox.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/string.c | 29 ----------------------------- 4 files changed, 57 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f31638c6e873..f4fa2b29c38c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -357,6 +357,7 @@ int __must_check kstrtou16(const char *s, unsigned int base, u16 *res); int __must_check kstrtos16(const char *s, unsigned int base, s16 *res); int __must_check kstrtou8(const char *s, unsigned int base, u8 *res); int __must_check kstrtos8(const char *s, unsigned int base, s8 *res); +int __must_check kstrtobool(const char *s, bool *res); int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res); int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res); @@ -368,6 +369,7 @@ int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigne int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res); int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res); int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res); +int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res); static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res) { diff --git a/include/linux/string.h b/include/linux/string.h index 0f235e80d355..d3993a79a325 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -128,7 +128,11 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); -extern int strtobool(const char *s, bool *res); +extern int kstrtobool(const char *s, bool *res); +static inline int strtobool(const char *s, bool *res) +{ + return kstrtobool(s, res); +} int match_string(const char * const *array, size_t n, const char *string); diff --git a/lib/kstrtox.c b/lib/kstrtox.c index 94be244e8441..e8ba4a013e82 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -321,6 +321,56 @@ int kstrtos8(const char *s, unsigned int base, s8 *res) } EXPORT_SYMBOL(kstrtos8); +/** + * kstrtobool - convert common user inputs into boolean values + * @s: input string + * @res: result + * + * This routine returns 0 iff the first character is one of 'Yy1Nn0'. + * Otherwise it will return -EINVAL. Value pointed to by res is + * updated upon finding a match. + */ +int kstrtobool(const char *s, bool *res) +{ + if (!s) + return -EINVAL; + + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + return 0; + case 'n': + case 'N': + case '0': + *res = false; + return 0; + default: + break; + } + + return -EINVAL; +} +EXPORT_SYMBOL(kstrtobool); + +/* + * Since "base" would be a nonsense argument, this open-codes the + * _from_user helper instead of using the helper macro below. + */ +int kstrtobool_from_user(const char __user *s, size_t count, bool *res) +{ + /* Longest string needed to differentiate, newline, terminator */ + char buf[4]; + + count = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, s, count)) + return -EFAULT; + buf[count] = '\0'; + return kstrtobool(buf, res); +} +EXPORT_SYMBOL(kstrtobool_from_user); + #define kstrto_from_user(f, g, type) \ int f(const char __user *s, size_t count, unsigned int base, type *res) \ { \ diff --git a/lib/string.c b/lib/string.c index e9c9db161d2c..ed83562a53ae 100644 --- a/lib/string.c +++ b/lib/string.c @@ -656,35 +656,6 @@ int match_string(const char * const *array, size_t n, const char *string) } EXPORT_SYMBOL(match_string); -/** - * strtobool - convert common user inputs into boolean values - * @s: input string - * @res: result - * - * This routine returns 0 iff the first character is one of 'Yy1Nn0'. - * Otherwise it will return -EINVAL. Value pointed to by res is - * updated upon finding a match. - */ -int strtobool(const char *s, bool *res) -{ - switch (s[0]) { - case 'y': - case 'Y': - case '1': - *res = true; - break; - case 'n': - case 'N': - case '0': - *res = false; - break; - default: - return -EINVAL; - } - return 0; -} -EXPORT_SYMBOL(strtobool); - #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value -- cgit v1.2.3-70-g09d2 From 4cc7ecb7f2a60e8deb783b8fbf7c1ae467acb920 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:23:00 -0700 Subject: param: convert some "on"/"off" users to strtobool This changes several users of manual "on"/"off" parsing to use strtobool. Some side-effects: - these uses will now parse y/n/1/0 meaningfully too - the early_param uses will now bubble up parse errors Signed-off-by: Kees Cook Acked-by: Heiko Carstens Acked-by: Michael Ellerman Cc: Amitkumar Karwar Cc: Andy Shevchenko Cc: Daniel Borkmann Cc: Joe Perches Cc: Kalle Valo Cc: Martin Schwidefsky Cc: Nishant Sarmukadam Cc: Rasmus Villemoes Cc: Steve French Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/rtasd.c | 9 ++------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 10 ++-------- arch/s390/kernel/time.c | 8 ++------ arch/s390/kernel/topology.c | 7 ++----- arch/x86/kernel/aperture_64.c | 12 ++---------- include/linux/tick.h | 2 +- kernel/time/hrtimer.c | 10 ++-------- kernel/time/tick-sched.c | 10 ++-------- 8 files changed, 15 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 5a2c049c1c61..aa610ce8742f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -49,7 +49,7 @@ static unsigned int rtas_error_log_buffer_max; static unsigned int event_scan; static unsigned int rtas_event_scan_rate; -static int full_rtas_msgs = 0; +static bool full_rtas_msgs; /* Stop logging to nvram after first fatal error */ static int logging_enabled; /* Until we initialize everything, @@ -592,11 +592,6 @@ __setup("surveillance=", surveillance_setup); static int __init rtasmsgs_setup(char *str) { - if (strcmp(str, "on") == 0) - full_rtas_msgs = 1; - else if (strcmp(str, "off") == 0) - full_rtas_msgs = 0; - - return 1; + return (kstrtobool(str, &full_rtas_msgs) == 0); } __setup("rtasmsgs=", rtasmsgs_setup); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 32274f72fe3f..282837a1d74b 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -47,20 +47,14 @@ static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; -static int cede_offline_enabled __read_mostly = 1; +static bool cede_offline_enabled __read_mostly = true; /* * Enable/disable cede_offline when available. */ static int __init setup_cede_offline(char *str) { - if (!strcmp(str, "off")) - cede_offline_enabled = 0; - else if (!strcmp(str, "on")) - cede_offline_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &cede_offline_enabled) == 0); } __setup("cede_offline=", setup_cede_offline); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c4e5f183f225..9409d32f285e 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1432,7 +1432,7 @@ device_initcall(etr_init_sysfs); /* * Server Time Protocol (STP) code. */ -static int stp_online; +static bool stp_online; static struct stp_sstpi stp_info; static void *stp_page; @@ -1443,11 +1443,7 @@ static struct timer_list stp_timer; static int __init early_parse_stp(char *p) { - if (strncmp(p, "off", 3) == 0) - stp_online = 0; - else if (strncmp(p, "on", 2) == 0) - stp_online = 1; - return 0; + return kstrtobool(p, &stp_online); } early_param("stp", early_parse_stp); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 40b8102fdadb..64298a867589 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -37,7 +37,7 @@ static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; -static int topology_enabled = 1; +static bool topology_enabled = true; static DECLARE_WORK(topology_work, topology_work_fn); /* @@ -444,10 +444,7 @@ static const struct cpumask *cpu_book_mask(int cpu) static int __init early_parse_topology(char *p) { - if (strncmp(p, "off", 3)) - return 0; - topology_enabled = 0; - return 0; + return kstrtobool(p, &topology_enabled); } early_param("topology", early_parse_topology); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e85f713641d..0a2bb1f62e72 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) return 0; } -static int gart_fix_e820 __initdata = 1; +static bool gart_fix_e820 __initdata = true; static int __init parse_gart_mem(char *p) { - if (!p) - return -EINVAL; - - if (!strncmp(p, "off", 3)) - gart_fix_e820 = 0; - else if (!strncmp(p, "on", 2)) - gart_fix_e820 = 1; - - return 0; + return kstrtobool(p, &gart_fix_e820); } early_param("gart_fix_e820", parse_gart_mem); diff --git a/include/linux/tick.h b/include/linux/tick.h index 21f73649a4dc..62be0786d6d0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -111,7 +111,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #ifdef CONFIG_NO_HZ_COMMON -extern int tick_nohz_enabled; +extern bool tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 58a321c34cfb..fa0b983290cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? */ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 969e6704c3c9..195fe7d2caad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -486,20 +486,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); -- cgit v1.2.3-70-g09d2 From 2553b67a1fbe7bf202e4e8070ab0b00d3d3a06a2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 17 Mar 2016 14:23:04 -0700 Subject: lib/bug.c: use common WARN helper The traceoff_on_warning option doesn't have any effect on s390, powerpc, arm64, parisc, and sh because there are two different types of WARN implementations: 1) The above mentioned architectures treat WARN() as a special case of a BUG() exception. They handle warnings in report_bug() in lib/bug.c. 2) All other architectures just call warn_slowpath_*() directly. Their warnings are handled in warn_slowpath_common() in kernel/panic.c. Support traceoff_on_warning on all architectures and prevent any future divergence by using a single common function to emit the warning. Also remove the '()' from '%pS()', because the parentheses look funky: [ 45.607629] WARNING: at /root/warn_mod/warn_mod.c:17 .init_dummy+0x20/0x40 [warn_mod]() Reported-by: Chunyu Hu Signed-off-by: Josh Poimboeuf Acked-by: Heiko Carstens Tested-by: Prarit Bhargava Acked-by: Prarit Bhargava Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 6 ++++++ kernel/panic.c | 41 ++++++++++++++++++++++++++--------------- lib/bug.c | 26 ++------------------------ 3 files changed, 34 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index fdf6fa078422..f90588abbfd4 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -81,6 +81,12 @@ extern void warn_slowpath_null(const char *file, const int line); do { printk(arg); __WARN_TAINT(taint); } while (0) #endif +/* used internally by panic.c */ +struct warn_args; + +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args); + #ifndef WARN_ON #define WARN_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ diff --git a/kernel/panic.c b/kernel/panic.c index d96469de72dc..fa400852bf6c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -449,20 +450,25 @@ void oops_exit(void) kmsg_dump(KMSG_DUMP_OOPS); } -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { +struct warn_args { const char *fmt; va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", - raw_smp_processor_id(), current->pid, file, line, caller); + + if (file) + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", + raw_smp_processor_id(), current->pid, file, line, + caller); + else + pr_warn("WARNING: CPU: %d PID: %d at %pS\n", + raw_smp_processor_id(), current->pid, caller); if (args) vprintk(args->fmt, args->args); @@ -479,20 +485,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller, } print_modules(); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); + print_oops_end_marker(); + /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef WANT_WARN_ON_SLOWPATH void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, + &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); @@ -500,20 +513,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_fmt_taint(const char *file, int line, unsigned taint, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); + __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/lib/bug.c b/lib/bug.c index 6cde380f09de..bc3656e944d2 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -167,30 +167,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ - pr_warn("------------[ cut here ]------------\n"); - - if (file) - pr_warn("WARNING: at %s:%u\n", file, line); - else - pr_warn("WARNING: at %p [verbose debug info unavailable]\n", - (void *)bugaddr); - - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from - * panicking the system on this thread. Other threads - * are blocked by the panic_mutex in panic(). - */ - panic_on_warn = 0; - panic("panic_on_warn set ...\n"); - } - - print_modules(); - show_regs(regs); - print_oops_end_marker(); - /* Just a warning, don't kill lockdep. */ - add_taint(BUG_GET_TAINT(bug), LOCKDEP_STILL_OK); + __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs, + NULL); return BUG_TRAP_TYPE_WARN; } -- cgit v1.2.3-70-g09d2