From d7e0b37a87c39f5c02dd7b5d55c7a3ec2f65b943 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Nov 2013 15:08:12 -0800 Subject: mm: set N_CPU to node_states during boot After a system has booted, N_CPU is not set to any node as has_cpu shows an empty line. # cat /sys/devices/system/node/has_cpu (show-empty-line) setup_vmstat() registers its CPU notifier callback, vmstat_cpuup_callback(), which marks N_CPU to a node when a CPU is brought online. However, setup_vmstat() is called after all CPUs are launched in the boot sequence. Changed setup_vmstat() to mark N_CPU to the nodes with online CPUs at boot, which is consistent with other operations in vmstat_cpuup_callback(), i.e. start_cpu_timer() and refresh_zone_stat_thresholds(). Also added get_online_cpus() to protect the for_each_online_cpu() loop. Signed-off-by: Toshi Kani Acked-by: Christoph Lameter Reviewed-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/vmstat.c') diff --git a/mm/vmstat.c b/mm/vmstat.c index 9bb314577911..0a1f7de972b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1276,8 +1276,12 @@ static int __init setup_vmstat(void) register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) + get_online_cpus(); + for_each_online_cpu(cpu) { start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); + } + put_online_cpus(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); -- cgit v1.2.3-70-g09d2 From 807a1bd2b2a38845fd422b93328e7d69f13eb13a Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Nov 2013 15:08:13 -0800 Subject: mm: clear N_CPU from node_states at CPU offline vmstat_cpuup_callback() is a CPU notifier callback, which marks N_CPU to a node at CPU online event. However, it does not update this N_CPU info at CPU offline event.
Changed vmstat_cpuup_callback() to clear N_CPU when the last CPU in the node is taken offline, i.e. the node no longer has any online CPU. Signed-off-by: Toshi Kani Acked-by: Christoph Lameter Reviewed-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm/vmstat.c') diff --git a/mm/vmstat.c b/mm/vmstat.c index 0a1f7de972b3..b6d17edf8cf3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1229,6 +1229,20 @@ static void start_cpu_timer(int cpu) schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } +static void vmstat_cpu_dead(int node) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (cpu_to_node(cpu) == node) + goto end; + + node_clear_state(node, N_CPU); +end: + put_online_cpus(); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. @@ -1258,6 +1272,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: refresh_zone_stat_thresholds(); + vmstat_cpu_dead(cpu_to_node(cpu)); break; default: break; -- cgit v1.2.3-70-g09d2 From 72403b4a0fbdf433c1fe0127e49864658f6f6468 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 12 Nov 2013 15:08:32 -0800 Subject: mm: numa: return the number of base pages altered by protection changes Commit 0255d4918480 ("mm: Account for a THP NUMA hinting update as one PTE update") was added to account for the number of PTE updates when marking pages prot_numa. task_numa_work was using the old return value to track how much address space had been updated. Altering the return value causes the scanner to do more work than it is configured or documented to in a single unit of work. This patch reverts that commit and accounts for the number of THP updates separately in vmstat. It is up to the administrator to interpret the pair of values correctly.
This is a straightforward operation and likely to only be of interest when actively debugging NUMA balancing problems. The impact of this patch is that the NUMA PTE scanner will scan slower when THP is enabled and workloads may converge slower as a result. On the flip side, system CPU usage should be lower than recent tests reported. This is an illustrative example of a short single JVM specjbb test: specjbb 3.12.0 3.12.0 vanilla acctupdates TPut 1 26143.00 ( 0.00%) 25747.00 ( -1.51%) TPut 7 185257.00 ( 0.00%) 183202.00 ( -1.11%) TPut 13 329760.00 ( 0.00%) 346577.00 ( 5.10%) TPut 19 442502.00 ( 0.00%) 460146.00 ( 3.99%) TPut 25 540634.00 ( 0.00%) 549053.00 ( 1.56%) TPut 31 512098.00 ( 0.00%) 519611.00 ( 1.47%) TPut 37 461276.00 ( 0.00%) 474973.00 ( 2.97%) TPut 43 403089.00 ( 0.00%) 414172.00 ( 2.75%) 3.12.0 3.12.0 vanilla acctupdates User 5169.64 5184.14 System 100.45 80.02 Elapsed 252.75 251.85 Performance is similar but note the reduction in system CPU time. While this showed a performance gain, it will not be universal but at least it'll be behaving as documented. The vmstats are obviously different but here is an obvious interpretation of them from mmtests. 3.12.0 3.12.0 vanilla acctupdates NUMA page range updates 1408326 11043064 NUMA huge PMD updates 0 21040 NUMA PTE updates 1408326 291624 "NUMA page range updates" == nr_pte_updates and is the value returned to the NUMA pte scanner. NUMA huge PMD updates were the number of THP updates which in combination can be used to calculate how many ptes were updated from userspace.
Signed-off-by: Mel Gorman Reported-by: Alex Thorlton Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/mprotect.c | 10 +++++++--- mm/vmstat.c | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'mm/vmstat.c') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 1855f0a22add..c557c6d096de 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -39,6 +39,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, + NUMA_HUGE_PTE_UPDATES, NUMA_HINT_FAULTS, NUMA_HINT_FAULTS_LOCAL, NUMA_PAGE_MIGRATE, diff --git a/mm/mprotect.c b/mm/mprotect.c index a597f2ffcd6f..26667971c824 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; + unsigned long nr_huge_updates = 0; pmd = pmd_offset(pud, addr); do { @@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, newprot, prot_numa); if (nr_ptes) { - if (nr_ptes == HPAGE_PMD_NR) - pages++; - + if (nr_ptes == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } continue; } } @@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += this_pages; } while (pmd++, addr = next, addr != end); + if (nr_huge_updates) + count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b6d17edf8cf3..72496140ac08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -812,6 +812,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", + "numa_huge_pte_updates", "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", -- cgit v1.2.3-70-g09d2