From 3a2e9a5a2afc1a2d2c548b8987f133235cebe933 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 21:56:00 +0800 Subject: writeback: balance_dirty_pages() shall write more than dirtied pages Some filesystems may choose to write much more than ratelimit_pages before calling balance_dirty_pages_ratelimited_nr(). So it is safer to determine the number to write based on the real number of dirtied pages. Otherwise it is possible that loop { btrfs_file_write(): dirty 1024 pages balance_dirty_pages(): write up to 48 pages (= ratelimit_pages * 1.5) } in which the writeback rate cannot keep up with dirty rate, and the dirty pages go all the way beyond dirty_thresh. The increased write_chunk may make the dirtier more bumpy. So filesystems shall take care not to dirty too much at a time (eg. > 4MB) without checking the ratelimit. Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Signed-off-by: Jens Axboe --- mm/page-writeback.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f378dd58802..cbd4cba468bd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -44,12 +44,15 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. 
*/ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ @@ -477,7 +480,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * If we're over `background_thresh' then pdflush is woken to perform some * writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -640,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); -- cgit v1.2.3-70-g09d2 From d3ddec7635b6fb37cb49e3553bdeea59642be653 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 20:33:40 +0800 Subject: writeback: stop background writeback when below background threshold Treat bdi_start_writeback(0) as a special request to do background write, and stop such work when we are below the background dirty threshold. Also simplify the (nr_pages <= 0) checks. Since we already pass in nr_pages=LONG_MAX for WB_SYNC_ALL and background writes, we don't need to worry about it being decreased to zero. 
Reported-by: Richard Kennedy CC: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 28 +++++++++++++++++----------- mm/page-writeback.c | 6 +++--- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c59d6737036c..476be9b10881 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -41,8 +41,9 @@ struct wb_writeback_args { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; - int for_kupdate; - int range_cyclic; + int for_kupdate:1; + int range_cyclic:1; + int for_background:1; }; /* @@ -257,6 +258,15 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) .range_cyclic = 1, }; + /* + * We treat @nr_pages=0 as the special case to do background writeback, + * ie. to sync pages until the background dirty threshold is reached. + */ + if (!nr_pages) { + args.nr_pages = LONG_MAX; + args.for_background = 1; + } + bdi_alloc_queue_work(bdi, &args); } @@ -720,20 +730,16 @@ static long wb_writeback(struct bdi_writeback *wb, for (;;) { /* - * Don't flush anything for non-integrity writeback where - * no nr_pages was given + * Stop writeback when nr_pages has been consumed */ - if (!args->for_kupdate && args->nr_pages <= 0 && - args->sync_mode == WB_SYNC_NONE) + if (args->nr_pages <= 0) break; /* - * If no specific pages were given and this is just a - * periodic background writeout and we are below the - * background dirty threshold, don't do anything + * For background writeout, stop when we are below the + * background dirty threshold */ - if (args->for_kupdate && args->nr_pages <= 0 && - !over_bground_thresh()) + if (args->for_background && !over_bground_thresh()) break; wbc.more_io = 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cbd4cba468bd..3c78fc316202 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -593,10 +593,10 @@ static void balance_dirty_pages(struct 
address_space *mapping, * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, nr_writeback); + bdi_start_writeback(bdi, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) -- cgit v1.2.3-70-g09d2 From 5b0830cb9085f4b69f9d57d7f3aaff322ffbec26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 23 Sep 2009 19:37:09 +0200 Subject: writeback: get rid of incorrect references to pdflush in comments Signed-off-by: Jens Axboe --- fs/buffer.c | 10 +++++----- fs/fs-writeback.c | 5 +---- mm/page-writeback.c | 8 ++++---- mm/shmem.c | 5 +++-- mm/vmscan.c | 8 ++++---- 5 files changed, 17 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/fs/buffer.c b/fs/buffer.c index 90a98865b0cc..fc22b4504087 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -274,7 +274,7 @@ void invalidate_bdev(struct block_device *bdev) } /* - * Kick pdflush then try to free up some ZONE_NORMAL memory. + * Kick the writeback threads then try to free up some ZONE_NORMAL memory. */ static void free_more_memory(void) { @@ -1699,9 +1699,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page, /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the page. Note that this can - * potentially cause a busy-wait loop from pdflush and kswapd - * activity, but those code paths have their own higher-level - * throttling. + * potentially cause a busy-wait loop from writeback threads + * and kswapd activity, but those code paths have their own + * higher-level throttling. 
*/ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); @@ -3191,7 +3191,7 @@ void block_sync_page(struct page *page) * still running obsolete flush daemons, so we terminate them here. * * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `pdflush' kernel threads fully replace bdflush daemons and this call. + * The `flush-X' kernel threads fully replace bdflush daemons and this call. */ SYSCALL_DEFINE2(bdflush, int, func, long, data) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 15e375bf93e6..15944f754e15 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -320,7 +320,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times - * from permanently stopping the whole pdflush writeback. + * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif @@ -1085,9 +1085,6 @@ EXPORT_SYMBOL(__mark_inode_dirty); * If older_than_this is non-NULL, then only write out inodes which * had their first dirtying at a time earlier than *older_than_this. * - * If we're a pdlfush thread, then implement pdflush collision avoidance - * against the entire list. - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. 
For other superblocks, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c78fc316202..8bef063125b1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -58,7 +58,7 @@ static inline long sync_writeback_pages(unsigned long dirtied) /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -477,8 +477,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, unsigned long write_chunk) @@ -582,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping, bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before diff --git a/mm/shmem.c b/mm/shmem.c index b206a7a32e2a..aa9481166aae 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. 
However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 613e89f471d9..359c3c57ef85 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1709,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed -- cgit v1.2.3-70-g09d2