summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c175
-rw-r--r--kernel/sched/psi.c280
-rw-r--r--kernel/sched/sched.h1
-rw-r--r--kernel/sched/stats.h6
6 files changed, 402 insertions, 77 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f4d02201f424..5800b0623ff3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -4389,6 +4390,17 @@ void set_numabalancing_state(bool enabled)
}
#ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -4405,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
sysctl_numa_balancing_mode = state;
__set_numabalancing_state(state);
}
@@ -5159,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 667876da8382..1637b65ba07a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -333,6 +333,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5ffec4370602..e4a0b8bd941c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -40,6 +40,7 @@
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
@@ -1090,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
struct numa_group {
refcount_t refcount;
@@ -1432,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So CPU needs to be checked to avoid to
+ * access out of array bound.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = xchg_page_access_time(page, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1439,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !node_is_toptier(src_nid)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(page);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ thp_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
@@ -2681,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2761,6 +2926,7 @@ static void task_numa_work(struct callback_head *work)
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2817,13 +2983,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- vma = find_vma(mm, start);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
}
- for (; vma; vma = vma->vm_next) {
+
+ for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7f6030091aee..ee2ecc081422 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
int cpu;
+ group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
+ static_branch_disable(&psi_cgroups_enabled);
return;
}
@@ -211,7 +213,7 @@ void __init psi_init(void)
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
switch (state) {
case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
- return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+ return unlikely(tasks[NR_RUNNING] > oncpu);
case PSI_CPU_FULL:
- return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+ return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock)
{
struct psi_group_cpu *groupc;
- u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
+ u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we assess the aggregate resource states this CPU's
- * tasks have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- *
- * Then we update the task counts according to the state
+ * First we update the task counts according to the state
* change requested through the @clear and @set bits.
+ *
+ * Then if the cgroup PSI stats accounting enabled, we
+ * assess the aggregate resource states this CPU's tasks
+ * have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
- record_times(groupc, now);
+ /*
+ * Start with TSK_ONCPU, which doesn't have a corresponding
+ * task count - it's just a boolean flag directly encoded in
+ * the state mask. Clear, set, or carry the current state if
+ * no changes are requested.
+ */
+ if (unlikely(clear & TSK_ONCPU)) {
+ state_mask = 0;
+ clear &= ~TSK_ONCPU;
+ } else if (unlikely(set & TSK_ONCPU)) {
+ state_mask = PSI_ONCPU;
+ set &= ~TSK_ONCPU;
+ } else {
+ state_mask = groupc->state_mask & PSI_ONCPU;
+ }
+ /*
+ * The rest of the state mask is calculated based on the task
+ * counts. Update those first, then construct the mask.
+ */
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], groupc->tasks[4],
- clear, set);
+ groupc->tasks[3], clear, set);
psi_bug = 1;
}
}
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
- /* Calculate state mask representing active states */
+ if (!group->enabled) {
+ /*
+ * On the first group change after disabling PSI, conclude
+ * the current state and flush its time. This is unlikely
+ * to matter to the user, but aggregation (get_recent_times)
+ * may have already incorporated the live state into times_prev;
+ * avoid a delta sample underflow when PSI is later re-enabled.
+ */
+ if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ write_seqcount_end(&groupc->seq);
+ return;
+ }
+
for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s))
+ if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
}
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
* task in a cgroup is in_memstall, the corresponding groupc
* on that cpu is in PSI_MEM_FULL state.
*/
- if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+ if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL);
+ record_times(groupc, now);
+
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
{
- if (*iter == &psi_system)
- return NULL;
-
#ifdef CONFIG_CGROUPS
- if (static_branch_likely(&psi_cgroups_enabled)) {
- struct cgroup *cgroup = NULL;
-
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else
- cgroup = cgroup_parent(*iter);
-
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
- }
- }
+ if (static_branch_likely(&psi_cgroups_enabled))
+ return cgroup_psi(task_dfl_cgroup(task));
#endif
- *iter = &psi_system;
return &psi_system;
}
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
- bool wake_clock = true;
- void *iter = NULL;
u64 now;
if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
psi_flags_change(task, clear, set);
now = cpu_clock(cpu);
- /*
- * Periodic aggregation shuts off if there is a period of no
- * task changes, so we wake it back up if necessary. However,
- * don't do this if the task change is the aggregation worker
- * itself going to sleep, or we'll ping-pong forever.
- */
- if (unlikely((clear & TSK_RUNNING) &&
- (task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_avgs_work))
- wake_clock = false;
- while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, now, wake_clock);
+ group = task_psi_group(task);
+ do {
+ psi_group_change(group, cpu, clear, set, now, true);
+ } while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
- void *iter;
u64 now = cpu_clock(cpu);
if (next->pid) {
- bool identical_state;
-
psi_flags_change(next, 0, TSK_ONCPU);
/*
- * When switching between tasks that have an identical
- * runtime state, the cgroup that contains both tasks
- * we reach the first common ancestor. Iterate @next's
- * ancestors only until we encounter @prev's ONCPU.
+ * Set TSK_ONCPU on @next's cgroups. If @next shares any
+ * ancestors with @prev, those will already have @prev's
+ * TSK_ONCPU bit set, and we can stop the iteration there.
*/
- identical_state = prev->psi_flags == next->psi_flags;
- iter = NULL;
- while ((group = iterate_groups(next, &iter))) {
- if (identical_state &&
- per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ group = task_psi_group(next);
+ do {
+ if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+ PSI_ONCPU) {
common = group;
break;
}
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
- }
+ } while ((group = group->parent));
}
if (prev->pid) {
int clear = TSK_ONCPU, set = 0;
+ bool wake_clock = true;
/*
* When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
+
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((prev->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(prev) == psi_avgs_work))
+ wake_clock = false;
}
psi_flags_change(prev, clear, set);
- iter = NULL;
- while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, clear, set, now, true);
+ group = task_psi_group(prev);
+ do {
+ if (group == common)
+ break;
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ } while ((group = group->parent));
/*
- * TSK_ONCPU is handled up to the common ancestor. If we're tasked
- * with dequeuing too, finish that for the rest of the hierarchy.
+ * TSK_ONCPU is handled up to the common ancestor. If there are
+ * any other differences between the two tasks (e.g. prev goes
+ * to sleep, or only one task is memstall), finish propagating
+ * those differences all the way up to the root.
*/
- if (sleep) {
+ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = iterate_groups(prev, &iter))
- psi_group_change(group, cpu, clear, set, now, true);
+ for (; group; group = group->parent)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+ int cpu = task_cpu(task);
+ struct psi_group *group;
+ struct psi_group_cpu *groupc;
+ u64 now;
+
+ if (!task->pid)
+ return;
+
+ now = cpu_clock(cpu);
+
+ group = task_psi_group(task);
+ do {
+ if (!group->enabled)
+ continue;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
+ write_seqcount_end(&groupc->seq);
+
+ if (group->poll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_poll_work(group, 1);
+ } while ((group = group->parent));
+}
+#endif
+
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
return -ENOMEM;
}
group_init(cgroup->psi);
+ cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return;
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
struct rq_flags rf;
struct rq *rq;
- if (static_branch_likely(&psi_disabled)) {
+ if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+ int cpu;
+
+ /*
+ * After we disable psi_group->enabled, we don't actually
+ * stop percpu tasks accounting in each psi_group_cpu,
+ * instead only stop test_state() loop, record_times()
+ * and averaging worker, see psi_group_change() for details.
+ *
+ * When disable cgroup PSI, this function has nothing to sync
+ * since cgroup pressure files are hidden and percpu psi_group_cpu
+ * would see !psi_group->enabled and only do task accounting.
+ *
+ * When re-enable cgroup PSI, this function use psi_group_change()
+ * to get correct state mask from test_state() loop on tasks[],
+ * and restart groupc->state_start from now, use .clear = .set = 0
+ * here since no task status really changed.
+ */
+ if (!group->enabled)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ u64 now;
+
+ rq_lock_irq(rq, &rf);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ rq_unlock_irq(rq, &rf);
+ }
+}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
+ bool only_full = false;
int full;
u64 now;
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ only_full = res == PSI_IRQ;
+#endif
+
+ for (full = 0; full < 2 - only_full; full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
- full ? "full" : "some",
+ full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+ return ERR_PTR(-EINVAL);
+#endif
+
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1fc198be1ffd..1644242ecd11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2446,6 +2446,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baa839c1ba96..84a188913cc9 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
}
#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO