summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2017-09-25 08:12:05 -0700
committerTejun Heo <tj@kernel.org>2017-09-25 08:12:05 -0700
commit041cd640b2f3c5607171c59d8712b503659d21f7 (patch)
tree2979112393aefa10e23245ae95f481763280dd6f /include/linux
parentd2cc5ed6949085cfba30ec5228816cf6eb1d02b9 (diff)
cgroup: Implement cgroup2 basic CPU usage accounting
In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to combination of two factors - 1. enabling cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. cpuacct controller is effectively used to designate a hierarchy to track CPU usages on. cgroup2's unified hierarchy removes the second reason and we can account basic CPU usages by default. While we can use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event which is unnecessary. This patch adds basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it. * All accountings are done per-cpu and don't propagate immediately. It just bumps the per-cgroup per-cpu counters and links to the parent's updated list if not already on it. * On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated. * CPU usage stats are collected and shown in "cgroup.stat" with "cpu." prefix. Total usage is collected from scheduling events. User/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust(). This keeps the accounting side hot path O(1) and per-cpu and the read side O(nr_updated_since_last_read). v2: Minor changes and documentation updates as suggested by Waiman and Roman. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Waiman Long <longman@redhat.com> Cc: Roman Gushchin <guro@fb.com>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/cgroup-defs.h57
-rw-r--r--include/linux/cgroup.h22
2 files changed, 79 insertions, 0 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ade4a78a54c2..3e55bbd31ad1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
#include <linux/refcount.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
+#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
@@ -254,6 +255,57 @@ struct css_set {
struct rcu_head rcu_head;
};
+/*
+ * cgroup basic resource usage statistics. Accounting is done per-cpu in
+ * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+ * reads.
+ *
+ * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+ * linked into the updated tree. On the following read, propagation only
+ * considers and consumes the updated tree. This makes reading O(the
+ * number of descendants which have been active since last read) instead of
+ * O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently. The combination can
+ * become very expensive. By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ */
+struct cgroup_cpu_stat {
+ /*
+ * ->sync protects all the current counters. These are the only
+ * fields which get updated in the hot path.
+ */
+ struct u64_stats_sync sync;
+ struct task_cputime cputime;
+
+ /*
+ * Snapshots at the last reading. These are used to calculate the
+ * deltas to propagate to the global counters.
+ */
+ struct task_cputime last_cputime;
+
+ /*
+ * Child cgroups with stat updates on this cpu since the last read
+ * are linked on the parent's ->updated_children through
+ * ->updated_next.
+ *
+ * In addition to being more compact, singly-linked list pointing
+ * to the cgroup makes it unnecessary for each per-cpu struct to
+ * point back to the associated cgroup.
+ *
+ * Protected by per-cpu cgroup_cpu_stat_lock.
+ */
+ struct cgroup *updated_children; /* terminated by self cgroup */
+ struct cgroup *updated_next; /* NULL iff not on the list */
+};
+
+struct cgroup_stat {
+ /* per-cpu statistics are collected into the folowing global counters */
+ struct task_cputime cputime;
+ struct prev_cputime prev_cputime;
+};
+
struct cgroup {
/* self css with NULL ->ss, points back to this cgroup */
struct cgroup_subsys_state self;
@@ -353,6 +405,11 @@ struct cgroup {
*/
struct cgroup *dom_cgrp;
+ /* cgroup basic resource statistics */
+ struct cgroup_cpu_stat __percpu *cpu_stat;
+ struct cgroup_stat pending_stat; /* pending from children */
+ struct cgroup_stat stat;
+
/*
* list of pidlists, up to two for each namespace (one for procs, one
* for tasks); created on demand.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6cd579329310..328a70ce0e23 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
u64 val) {}
#endif
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec);
+
static inline void cgroup_account_cputime(struct task_struct *task,
u64 delta_exec)
{
+ struct cgroup *cgrp;
+
cpuacct_charge(task, delta_exec);
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ if (cgroup_parent(cgrp))
+ __cgroup_account_cputime(cgrp, delta_exec);
+ rcu_read_unlock();
}
static inline void cgroup_account_cputime_field(struct task_struct *task,
enum cpu_usage_stat index,
u64 delta_exec)
{
+ struct cgroup *cgrp;
+
cpuacct_account_field(task, index, delta_exec);
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ if (cgroup_parent(cgrp))
+ __cgroup_account_cputime_field(cgrp, index, delta_exec);
+ rcu_read_unlock();
}
#else /* CONFIG_CGROUPS */