| author | Tejun Heo <tj@kernel.org> | 2019-08-26 09:06:56 -0700 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2019-08-27 09:22:38 -0600 |
| commit | 97b27821b4854ca744946dae32a3f2fd55bcd5bc | |
| tree | a89a3f0e814f684b87f9dd2198818d351b52c8aa /mm | |
| parent | d62241c7a406f0680d702bd974f6f17e28ab8e5d | |
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback.  The former
tracks ownership per-page while the latter does so per-inode.  This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads,
and was deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases.  For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode.  B owns the inode and
A's memory is limited far below B's.  A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback.  A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and a foreign flushing
mechanism so that when a memcg encounters a condition as above it can
trigger flushes on the bdi_writebacks which can clean its pages.  Please
see the comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
  /* write-range: loop copying stdin into FILE at offsets [START, START+SIZE) */
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <fcntl.h>
  #include <sys/types.h>
  static const char *usage = "write-range FILE START SIZE\n";
  int main(int argc, char **argv)
  {
	  int fd;
	  unsigned long start, size, end, pos;
	  char *endp;
	  char buf[4096];
	  if (argc < 4) {
		  fprintf(stderr, usage);
		  return 1;
	  }
	  fd = open(argv[1], O_WRONLY);
	  if (fd < 0) {
		  perror("open");
		  return 1;
	  }
	  start = strtoul(argv[2], &endp, 0);
	  if (*endp != '\0') {
		  fprintf(stderr, usage);
		  return 1;
	  }
	  size = strtoul(argv[3], &endp, 0);
	  if (*endp != '\0') {
		  fprintf(stderr, usage);
		  return 1;
	  }
	  end = start + size;
	  while (1) {
		  for (pos = start; pos < end; ) {
			  long bread, bwritten = 0;
			  if (lseek(fd, pos, SEEK_SET) < 0) {
				  perror("lseek");
				  return 1;
			  }
			  bread = read(0, buf, sizeof(buf) < end - pos ?
					       sizeof(buf) : end - pos);
			  if (bread < 0) {
				  perror("read");
				  return 1;
			  }
			  if (bread == 0)
				  return 0;
			  while (bwritten < bread) {
				  long this;
				  this = write(fd, buf + bwritten,
					       bread - bwritten);
				  if (this < 0) {
					  perror("write");
					  return 1;
				  }
				  bwritten += this;
			  }
			  /* advance by however much was written this round */
			  pos += bwritten;
		  }
	  }
  }
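The repro.sh script below expects a ./write-range binary next to it; one
plausible way to build and smoke-test it (compiler flags are only a
suggestion)::

  gcc -O2 -Wall -o write-range write-range.c
  # quick sanity check: copy 1M of zeroes into the first 1M of a test file
  touch testfile
  head -c $((1<<20)) /dev/zero | ./write-range testfile 0 $((1<<20))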
repro.sh::
  #!/bin/bash
  set -e
  set -x
  sysctl -w vm.dirty_expire_centisecs=300000
  sysctl -w vm.dirty_writeback_centisecs=300000
  sysctl -w vm.dirtytime_expire_seconds=300000
  echo 3 > /proc/sys/vm/drop_caches
  TEST=/sys/fs/cgroup/test
  A=$TEST/A
  B=$TEST/B
  mkdir -p $A $B
  echo "+memory +io" > $TEST/cgroup.subtree_control
  echo $((1<<30)) > $A/memory.high
  echo $((32<<30)) > $B/memory.high
  rm -f testfile
  touch testfile
  fallocate -l 4G testfile
  echo "Starting B"
  (echo $BASHPID > $B/cgroup.procs
   pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
  echo "Waiting 10s to ensure B claims the testfile inode"
  sleep 5
  sync
  sleep 5
  sync
  echo "Starting A"
  (echo $BASHPID > $A/cgroup.procs
   pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
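Per the description above, the expected behavior without this patch is that
A's pv throughput collapses once its dirty pages pile up, even though B and
the underlying device stay mostly idle; with the patch, the foreign flushes
keep A making progress.  The per-cgroup dirty page counts can be watched
from another terminal with something like the following (assuming a cgroup2
memory.stat that exposes file_dirty, as recent kernels do)::

  watch -n1 'grep file_dirty /sys/fs/cgroup/test/A/memory.stat /sys/fs/cgroup/test/B/memory.stat'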
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
    flushing while avoiding possible livelocks.
v4: Use get_jiffies_64() and time_before64()/time_after64() instead of raw
    jiffies_64 and arithmetic comparisons as suggested by Jan.
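The reason behind the v4 change is that raw comparisons on jiffies_64 break
when the counter wraps, while the helpers compare via a signed difference.
Stripped of their type checking, they reduce to roughly the following
(simplified from include/linux/jiffies.h)::

  #define time_after64(a, b)	((__s64)((b) - (a)) < 0)	/* a later than b */
  #define time_before64(a, b)	time_after64(b, a)

  /*
   * e.g. the flush path's "is this record still fresh?" test stays
   * correct across a jiffies_64 wrap:
   *
   *	if (time_after64(frn->at, now - intv))
   */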
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/memcontrol.c | 134 |
| -rw-r--r-- | mm/page-writeback.c | 4 |
2 files changed, 138 insertions, 0 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cdbb7a84cb6e..89b65f5ca634 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
 #define do_swap_account		0
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
@@ -4145,6 +4149,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	}
 }
 
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback.  The former
+ * trackes ownership per-page while the latter per-inode.  This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases.  For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode.  B owns the inode and
+ * A's memory is limited far below B's.  A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback.  A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction.  However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism.  When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired.  Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+					     struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg = page->mem_cgroup;
+	struct memcg_cgwb_frn *frn;
+	u64 now = get_jiffies_64();
+	u64 oldest_at = now;
+	int oldest = -1;
+	int i;
+
+	/*
+	 * Pick the slot to use.  If there is already a slot for @wb, keep
+	 * using it.  If not replace the oldest one which isn't being
+	 * written out.
+	 */
+	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+		frn = &memcg->cgwb_frn[i];
+		if (frn->bdi_id == wb->bdi->id &&
+		    frn->memcg_id == wb->memcg_css->id)
+			break;
+		if (time_before64(frn->at, oldest_at) &&
+		    atomic_read(&frn->done.cnt) == 1) {
+			oldest = i;
+			oldest_at = frn->at;
+		}
+	}
+
+	if (i < MEMCG_CGWB_FRN_CNT) {
+		/*
+		 * Re-using an existing one.  Update timestamp lazily to
+		 * avoid making the cacheline hot.  We want them to be
+		 * reasonably up-to-date and significantly shorter than
+		 * dirty_expire_interval as that's what expires the record.
+		 * Use the shorter of 1s and dirty_expire_interval / 8.
+		 */
+		unsigned long update_intv =
+			min_t(unsigned long, HZ,
+			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+		if (time_before64(frn->at, now - update_intv))
+			frn->at = now;
+	} else if (oldest >= 0) {
+		/* replace the oldest free one */
+		frn = &memcg->cgwb_frn[oldest];
+		frn->bdi_id = wb->bdi->id;
+		frn->memcg_id = wb->memcg_css->id;
+		frn->at = now;
+	}
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+	u64 now = jiffies_64;
+	int i;
+
+	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+		/*
+		 * If the record is older than dirty_expire_interval,
+		 * writeback on it has already started.  No need to kick it
+		 * off again.  Also, don't start a new one if there's
+		 * already one in flight.
+		 */
+		if (time_after64(frn->at, now - intv) &&
+		    atomic_read(&frn->done.cnt) == 1) {
+			frn->at = 0;
+			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+					       WB_REASON_FOREIGN_FLUSH,
+					       &frn->done);
+		}
+	}
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4661,6 +4786,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	struct mem_cgroup *memcg;
 	unsigned int size;
 	int node;
+	int __maybe_unused i;
 
 	size = sizeof(struct mem_cgroup);
 	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4704,6 +4830,9 @@
 #endif
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&memcg->cgwb_list);
+	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+		memcg->cgwb_frn[i].done =
+			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
 #endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	return memcg;
@@ -4833,7 +4962,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	int __maybe_unused i;
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
 		static_branch_dec(&memcg_sockets_enabled_key);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1804f64ff43c..50055d2e4ea8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 		if (unlikely(!writeback_in_progress(wb)))
 			wb_start_background_writeback(wb);
 
+		mem_cgroup_flush_foreign(wb);
+
 		/*
 		 * Calculate global domain's pos_ratio and select the
 		 * global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		task_io_account_write(PAGE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
+
+		mem_cgroup_track_foreign_dirty(page, wb);
 	}
 }
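The matching include/linux/memcontrol.h hunk is not shown here because the
diffstat is limited to 'mm'.  Judging from the identifiers used above, the
per-memcg record it introduces looks roughly like the sketch below; the slot
count and exact layout are assumptions, not copied from the patch::

  /* assumed shape of the header-side additions, for orientation only */
  #define MEMCG_CGWB_FRN_CNT	4

  struct memcg_cgwb_frn {
	  u64 bdi_id;			/* bdi->id of the foreign inode */
	  int memcg_id;			/* memcg css id of the foreign inode */
	  u64 at;			/* jiffies_64 at the time of dirtying */
	  struct wb_completion done;	/* tracks in-flight foreign flushes */
  };

  /*
   * Embedded in struct mem_cgroup under CONFIG_CGROUP_WRITEBACK:
   *
   *	struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
   */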
